diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index be6fdf0c..db4dceed 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -22,7 +22,7 @@ jobs: - {os: windows-latest, r: 'release'} - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel/1'} + - {os: ubuntu-latest, r: 'oldrel-1'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -43,26 +43,4 @@ jobs: with: extra-packages: rcmdcheck - - name: Check - env: - _R_CHECK_CRAN_INCOMING_: false - run: | - options(crayon.enabled = TRUE) - rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") - shell: Rscript {0} - - - name: Show testthat output - if: always() - run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true - shell: bash - - - name: Test coverage - run: covr::codecov() - shell: Rscript {0} - - - name: Upload check results - if: failure() - uses: actions/upload-artifact@main - with: - name: ${{ runner.os }}-r${{ matrix.config.r }}-results - path: check + - uses: r-lib/actions/check-r-package@v1 diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 00000000..63cbb18a --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,35 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/master/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-pandoc@v1 + + - uses: r-lib/actions/setup-r@v1 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v1 + with: + extra-packages: pkgdown + needs: website + + - name: Deploy package + run: | + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 00000000..3c0da1c9 --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,30 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/master/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: test-coverage + +jobs: + test-coverage: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v1 + with: + extra-packages: covr + + - name: Test coverage + run: covr::codecov() + shell: Rscript {0} diff --git a/.gitignore b/.gitignore index 1fb91752..fb832c23 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ doc Meta /doc/ /Meta/ +docs diff --git a/DESCRIPTION b/DESCRIPTION index 334a9174..df42262b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: metabolyseR Title: Methods for Pre-Treatment, Data Mining and Correlation Analyses of Metabolomics Data -Version: 0.14.9 +Version: 0.14.10 Authors@R: person("Jasen", "Finch", email = "jsf9@aber.ac.uk", role = c("aut", "cre")) Description: A tool kit for pre-treatment, modelling, feature selection and correlation analyses of metabolomics data. 
URL: https://jasenfinch.github.io/metabolyseR @@ -76,6 +76,7 @@ Collate: allClasses.R plotting.R plotUnsupervisedRF.R pre-treatment.R + predict.R QC.R reexports.R remove.R @@ -85,6 +86,7 @@ Collate: allClasses.R show-method.R split.R transform.R + tune.R univariate.R modelling-accessors.R VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 40a9c352..9e3324d2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,6 +48,7 @@ export(metabolyse) export(metrics) export(modellingMethods) export(modellingParameters) +export(mtry) export(nFeatures) export(nSamples) export(occupancy) @@ -73,6 +74,7 @@ export(preTreated) export(preTreatmentElements) export(preTreatmentMethods) export(preTreatmentParameters) +export(predict) export(proximity) export(randomForest) export(raw) @@ -97,6 +99,7 @@ export(transformSQRT) export(transformTICnorm) export(transformVast) export(ttest) +export(tune) export(type) exportClasses(Analysis) exportClasses(AnalysisData) @@ -137,6 +140,7 @@ importFrom(dplyr,mutate_if) importFrom(dplyr,n) importFrom(dplyr,relocate) importFrom(dplyr,rename) +importFrom(dplyr,rename_with) importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,select_if) @@ -147,6 +151,7 @@ importFrom(e1071,naiveBayes) importFrom(forestControl,fpr_fs) importFrom(furrr,furrr_options) importFrom(furrr,future_map) +importFrom(furrr,future_map2) importFrom(future,plan) importFrom(ggdendro,dendro_data) importFrom(ggplot2,aes) @@ -210,6 +215,7 @@ importFrom(patchwork,wrap_plots) importFrom(purrr,map) importFrom(purrr,map_chr) importFrom(purrr,map_dbl) +importFrom(purrr,map_depth) importFrom(purrr,map_df) importFrom(purrr,map_lgl) importFrom(purrr,walk) @@ -231,6 +237,7 @@ importFrom(stats,runif) importFrom(stats,sd) importFrom(stringr,str_c) importFrom(stringr,str_extract) +importFrom(stringr,str_remove) importFrom(stringr,str_remove_all) importFrom(stringr,str_replace_all) importFrom(stringr,str_split) @@ -241,6 +248,7 @@ importFrom(tibble,deframe) 
importFrom(tibble,rowid_to_column) importFrom(tibble,tibble) importFrom(tidyr,drop_na) +importFrom(tidyr,expand_grid) importFrom(tidyr,gather) importFrom(tidyr,spread) importFrom(tidyselect,all_of) diff --git a/NEWS.md b/NEWS.md index badde6a8..7336d921 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# metabolyseR 0.14.10 + +* Added the method [`predict()`](https://jasenfinch.github.io/metabolyseR/reference/predict.html) for the [`RandomForest`](https://jasenfinch.github.io/metabolyseR/reference/RandomForest-class.html) S4 class to predict model response values. + +* Added the method [`mtry()`](https://jasenfinch.github.io/metabolyseR/reference/modelling-accessors.html) for the [`AnalysisData`](https://jasenfinch.github.io/metabolyseR/reference/AnalysisData-class.html) S4 class to return the default `mtry` random forest parameter for a given response variable. + +* Added the method [`tune()`](https://jasenfinch.github.io/metabolyseR/reference/tune.html) for the [`AnalysisData`](https://jasenfinch.github.io/metabolyseR/reference/AnalysisData-class.html) S4 class to tune the random forest parameters `mtry` and `ntree` for a given response variable. + # metabolyseR 0.14.9 * Suppressed name repair console message encountered during random forest permutation testing. diff --git a/R/modelling-accessors.R b/R/modelling-accessors.R index ad11b9d7..3d0b9f60 100644 --- a/R/modelling-accessors.R +++ b/R/modelling-accessors.R @@ -9,6 +9,7 @@ #' @param ... arguments to parse to method for specific class #' @section Methods: #' * `binaryComparisons`: Return a vector of all possible binary comparisons for a given sample information column. +#' * `mtry`: Return the default `mtry` random forest parameter value for a given sample information column. #' * `type`: Return the type of random forest analysis. #' * `response`: Return the response variable name used for a random forest analysis. 
#' * `metrics`: Retrieve the model performance metrics for a random forest analysis @@ -21,9 +22,12 @@ #' #' d <- analysisData(abr1$neg[,200:300],abr1$fact) #' -#' ## Return possible binary comparisons for the 'day' column +#' ## Return possible binary comparisons for the `day` response column #' binaryComparisons(d,cls = 'day') #' +#' ## Return the default random forest `mtry` parameter for the `day` response column +#' mtry(d,cls = 'day') +#' #' ## Perform random forest analysis #' rf_analysis <- randomForest(d,cls = 'day') #' @@ -70,6 +74,40 @@ setMethod('binaryComparisons',signature = 'AnalysisData', #' @rdname modelling-accessors #' @export +setGeneric("mtry", function(x,cls = 'class') + standardGeneric("mtry")) + +#' @rdname modelling-accessors + +setMethod('mtry',signature = 'AnalysisData', + function(x,cls = 'class'){ + + if (is.null(cls)){ + rf_type <- 'classification' + } else { + response <- x %>% + clsExtract(cls = cls) + + rf_type <- ifelse(is.numeric(response), + 'regression', + 'classification') + } + + n_features <- nFeatures(x) + + mtry <- switch(rf_type, + regression = n_features/3, + classification = sqrt(n_features)) %>% + floor() %>% + c(.,1) %>% + max() + + return(mtry) + }) + +#' @rdname modelling-accessors +#' @export + setGeneric("type", function(x) standardGeneric("type")) diff --git a/R/nlda.R b/R/nlda.R index fdb4f937..670223a1 100644 --- a/R/nlda.R +++ b/R/nlda.R @@ -3,7 +3,7 @@ setGeneric('nlda',function(x,cls = 'class',prior = NULL,scale = FALSE,comprank = FALSE,...) 
standardGeneric('nlda')) -#' @importFrom e1071 naiveBayes +#' @importFrom e1071 naiveBayes #' @importFrom stats cov predict #' @importFrom methods as @@ -135,7 +135,7 @@ setMethod('nlda',signature = 'AnalysisData', dimnames(xmeans)[[2]] <- colnames(x) nbmod <- naiveBayes(data.frame(x),cl) - prob <- predict(nbmod,data.frame(x),type="raw") + prob <- stats::predict(nbmod,data.frame(x),type="raw") pred <- apply(prob,1,which.max) pred <- factor(levels(cl)[pred], levels = levels(cl)) diff --git a/R/predict.R b/R/predict.R new file mode 100644 index 00000000..0e55c097 --- /dev/null +++ b/R/predict.R @@ -0,0 +1,123 @@ +#' Predict random forest model responses +#' @rdname predict +#' @description Predict values of random forest model response variables from new data. +#' @param model S4 object of class `RandomForest` +#' @param new_data S4 object of class `AnalysisData` +#' @param idx sample information column to use for sample names. If `NULL`, the sample row number will be used. Sample names should be unique for each row of data. +#' @param type one of `response`, `prob`, or `votes` to indicate the type of prediction to make +#' @param ... arguments to pass to `randomForest::predict.randomForest()` +#' @details +#' The features contained within `new_data` should match those of the features used to train `model`. +#' The `features()` method can be used to check this. +#' The argument `returnModels = TRUE` should also be used when training the `RandomForest-class` object used for argument `model`. 
+#' @examples +#' library(metaboData) +#' +#' ## Prepare some data +#' x <- analysisData(abr1$neg[,200:300],abr1$fact) %>% +#' occupancyMaximum(cls = 'day') %>% +#' transformTICnorm() +#' +#' ## Extract data from which to train a random forest model +#' training_data <- x %>% +#' keepClasses(cls = 'day', +#' classes = c('H','1')) +#' +#' ## Extract data for which response values will be predicted +#' test_data <- x %>% +#' keepClasses(cls = 'day', +#' classes = c('2','3')) +#' +#' rf <- randomForest(training_data, +#' cls = 'day', +#' returnModels = TRUE) +#' +#' predict(rf, +#' test_data) +#' @importFrom purrr map_depth +#' @export + +setGeneric("predict", function(model, + new_data, + idx = NULL, + type = c('response','prob','votes'), + ...) + standardGeneric("predict")) + +#' @rdname predict + +setMethod('predict',signature = c('RandomForest','AnalysisData'), + function(model, + new_data, + idx = NULL, + type = c('response','prob','votes'), + ...){ + + if (type(model) == 'unsupervised') { + stop("Can't predict unsupervised random forest.", + call. = FALSE) + } + + if(length(model@models) == 0){ + stop('No random forest models detected. Use argument `returnModels = TRUE` when running method `randomForest()`.', + call. = FALSE) + } + + if (!is.null(idx)){ + sample_idx <- new_data %>% + clsExtract(cls = idx) + + if (any(duplicated(sample_idx))){ + stop(str_c('Duplicated sample names found in sample information column `', + idx, + '`. The specified sample names should be unique to each sample.'), + call. 
= FALSE) + } + } else { + sample_idx <- seq_len(nSamples(new_data)) + } + + type <- match.arg(type, + c('response','prob','votes')) + + test_data <- dat(new_data) + + model_object_depth <- switch(type(model), + classification = 4, + regression = 3) + + model_predictions <- model@models %>% + map_depth(.depth = model_object_depth, + .f = ~ .x %>% + { + tibble( + Sample = sample_idx, + Prediction = stats::predict( + object = .x, + newdata = test_data, + type = type, + ...)) + }) %>% + map_depth(.depth = model_object_depth - 2, + .f = ~ .x$models) + + column_headers <- c('Response', + 'Comparison', + 'Rep') + type_column_headers <- switch( + type(model), + classification = column_headers, + regression = column_headers[c(1,3)] + ) + + for (i in rev(type_column_headers)) { + model_predictions <- map_depth(.x = model_predictions, + .depth = which(type_column_headers == i) - 1, + .f = bind_rows,.id = i) + } + + model_predictions <- model_predictions %>% + mutate(Rep = as.numeric(Rep)) + + return(model_predictions) + }) diff --git a/R/tune.R b/R/tune.R new file mode 100644 index 00000000..2a1cc86f --- /dev/null +++ b/R/tune.R @@ -0,0 +1,112 @@ +#' Tune random forest parameters +#' @rdname tune +#' @description Tune the `mtry` and `ntree` random forest parameters using a grid search approach. +#' @param x S4 object of class `AnalysisData` +#' @param cls sample information column to use +#' @param mtry_range numeric vector of `mtry` values to search +#' @param ntree_range numeric vector of `ntree` values to search +#' @param seed random number seed +#' @details +#' Parameter tuning is performed by grid search of all combinations of the `mtry_range` and `ntree_range` vectors provided. +#' The optimal parameter values are selected using the out-of-bag error estimates of the `margin` metric for classification and the `rmse` (root-mean-square error) metric for regression. +#' @return +#' A list containing the optimal `mtry` and `ntree` parameters. 
+#' This is suitable for use as the `rf` argument in method `randomForest()`. +#' @examples +#' library(metaboData) +#' +#' ## Prepare some data +#' x <- analysisData(abr1$neg[,200:300],abr1$fact) %>% +#' occupancyMaximum(cls = 'day') %>% +#' transformTICnorm() +#' +#' ## Tune the `mtry` parameter for the `day` response +#' tune(x,cls = 'day') +#' @export + +setGeneric("tune", function(x, + cls = 'class', + mtry_range = floor(seq(mtry(x,cls = cls) - mtry(x,cls = cls)/2, + mtry(x,cls = cls) + mtry(x,cls = cls)/2, + length.out = 4)), + ntree_range = 1000, + seed = 1234) + standardGeneric("tune")) + +#' @rdname tune +#' @importFrom tidyr expand_grid +#' @importFrom dplyr rename_with +#' @importFrom stringr str_remove +#' @importFrom furrr future_map2 + +setMethod('tune',signature = 'AnalysisData', + function(x, + cls = 'class', + mtry_range = floor(seq(mtry(x,cls = cls) - mtry(x,cls = cls)/2, + mtry(x,cls = cls) + mtry(x,cls = cls)/2, + length.out = 4)), + ntree_range = 1000, + seed = 1234){ + + if (is.null(cls)){ + stop("Can't tune unsupervised random forest.", + call. 
= FALSE) + } + + response <- clsExtract(x,cls = cls) + + rf_type <- ifelse(is.numeric(response), + 'regression', + 'classification') + + metric <- switch(rf_type, + regression = 'rmse', + classification = 'margin') + + combinations <- expand_grid(mtry_range, + ntree_range) %>% + rename_with(~ str_remove(.x, + '_range')) + + search_results <- combinations %>% + { + future_map2( + .$ntree, + .$mtry, + .f = ~{ + rf_res <- try(randomForest(x, + cls = cls, + rf = list(ntree = .x, + mtry = .y)), + silent = TRUE) + if (class(rf_res) == 'RandomForest'){ + rf_res %>% + metrics() %>% + select(-Response,-.estimator,-contains('Comparison')) %>% + spread(.metric,.estimate) %>% + mutate(ntree = .x, + mtry = .y) + } else { + NULL + } + + }, + .options = furrr_options(seed = seed)) + } %>% + bind_rows() + + if (nrow(search_results) > 0){ + search_results <- switch(metric, + rmse = search_results %>% + arrange(!!sym(metric)) , + margin = search_results %>% + arrange(desc(!!sym(metric)))) %>% + {list(mtry = .$mtry[1], + ntree = .$ntree[1])} + + return(search_results) + } else { + return(list()) + } + + }) diff --git a/_pkgdown.yml b/_pkgdown.yml index a475f749..62bdfe16 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,6 +1,6 @@ -destination: docs - url: https://jasenfinch.github.io/metabolyseR/ +template: + bootstrap: 5 navbar: components: @@ -55,8 +55,10 @@ reference: - ttest - linearRegression - binaryComparisons + - tune - mds - roc + - predict - title: Correlations contents: @@ -84,4 +86,3 @@ reference: - split - rsd - occupancy - \ No newline at end of file diff --git a/docs/404.html b/docs/404.html deleted file mode 100644 index 44d3a98f..00000000 --- a/docs/404.html +++ /dev/null @@ -1,121 +0,0 @@ - - -
- - - - -vignettes/01_quick_start.Rmd
- 01_quick_start.Rmd
This example analysis will use the abr1
data set from the metaboData package. It is nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data from a plant-pathogen infection time course experiment. The analysis will also include use of the pipe %>%
from the magrittr package. First load the necessary packages.
For this example we will use only the negative acquisition mode data (abr1$neg
) and sample meta-information (abr1$fact
). Create an AnalysisData
class object using the following:
-d <- analysisData(abr1$neg,abr1$fact)
The data includes 120 samples and 2000 mass spectral features as shown below.
-
-d
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2000
-#> Info: 9
The clsAvailable()
function can be used to identify the columns available in our meta-information table.
-clsAvailable(d)
-#> [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name" "rep"
-#> [8] "day" "class"
For this analysis, we will be using the infection time course class information contained in the day
column. This can be extracted and the class frequencies tabulated using the following:
-d %>%
- clsExtract(cls = 'day') %>%
- table()
-#> .
-#> 1 2 3 4 5 H
-#> 20 20 20 20 20 20
As can be seen above, the experiment is made up of six infection time point classes that includes a healthy control class (H
) and five day infection time points (1-5
), each with 20 replicates.
For data pre-treatment prior to statistical analysis, a two-thirds maximum class occupancy filter can be applied. Features where the maximum proportion of non-missing data per class is above two-thirds are retained. A total ion count normalisation will also be applied.
-
-d <- d %>%
- occupancyMaximum(cls = 'day', occupancy = 2/3) %>%
- transformTICnorm()
-d
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1760
-#> Info: 9
This has reduced the data set to 1760 relevant features.
-The structure of the data can be visualised using both unsupervised and supervised methods. For instance, the first two principal components from a principal component analysis (PCA) of the data with the sample points coloured by infection class can be plotted using:
-
-plotPCA(d,cls = 'day',xAxis = 'PC1',yAxis = 'PC2')
And similarly, multidimensional scaling (MDS) of sample proximity values from a supervised random forest classification model can be plotted, along with receiver operating characteristic (ROC) curves.
-
-plotSupervisedRF(d,cls = 'day')
A progression can clearly be seen from the earliest to latest infected time points.
-For feature selection, one-way analysis of variance (ANOVA) can be performed for each feature to identify features significantly explanatory for the infection time point.
-
-anova_results <- d %>%
- anova(cls = 'day')
A table of the significantly explanatory features, with a Bonferroni correction adjusted p value < 0.05, can be extracted using:
-
-explan_feat <- explanatoryFeatures(anova_results,threshold = 0.05)
-explan_feat
-#> # A tibble: 379 × 10
-#> Response Comparison Feature term df sumsq meansq statistic p.value
-#> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H N341 response 5 3.88e-4 7.76e-5 137. 1.55e-46
-#> 2 day 1~2~3~4~5~H N133 response 5 7.00e-5 1.40e-5 126. 8.63e-45
-#> 3 day 1~2~3~4~5~H N163 response 5 6.01e-5 1.20e-5 117. 2.95e-43
-#> 4 day 1~2~3~4~5~H N1087 response 5 2.42e-6 4.84e-7 99.8 5.61e-40
-#> 5 day 1~2~3~4~5~H N171 response 5 2.25e-7 4.50e-8 95.7 3.84e-39
-#> 6 day 1~2~3~4~5~H N513 response 5 3.38e-6 6.76e-7 95.3 4.78e-39
-#> 7 day 1~2~3~4~5~H N1025 response 5 2.78e-6 5.56e-7 91.0 3.91e-38
-#> 8 day 1~2~3~4~5~H N342 response 5 3.71e-6 7.41e-7 90.3 5.32e-38
-#> 9 day 1~2~3~4~5~H N1083 response 5 5.11e-5 1.02e-5 89.0 1.06e-37
-#> 10 day 1~2~3~4~5~H N1085 response 5 1.10e-5 2.19e-6 83.4 1.92e-36
-#> # … with 369 more rows, and 1 more variable: adjusted.p.value <dbl>
The ANOVA has identified 379 features significantly explanatory over the infection time course. A heat map of the mean relative intensity for each class of these explanatory features can be plotted to visualise their trends between the infection time point classes.
-
-plotExplanatoryHeatmap(anova_results,
- threshold = 0.05,
- featureNames = FALSE)
Many of the explanatory features can be seen to be most highly abundant in the final infection time point 5
.
Finally, box plots of the trends of individual features can be plotted, such as the N341
feature below.
-plotFeature(anova_results,feature = 'N341',cls = 'day')
vignettes/02_introduction.Rmd
- 02_introduction.Rmd
The metabolyseR package provides a suite of methods that encompass three elements of metabolomics data analysis:
-The package also distinguishes between the flexibility and simplicity required for exploratory analyses compared to the convenience needed for more complex routine analyses. This is reflected in the underlying S4 object-oriented implementations and associated methods defined within the package. It should be noted that it is useful to understand the principles involved in using metabolyseR for exploratory analyses to aid in extracting and wrangling the results generated from routine analyses.
-The following document will provide an introduction to the basic usage of the package and includes how to create and use the base classes that are the foundation of metabolyseR. This will be focused around the applications for both exploratory and routine analyses. For more detailed information on the individual analysis elements see their associated vignette using:
-
-browseVignettes('metabolyseR')
There is also an example quick start analysis vignette provided.
-
-vignette('quick_start','metabolyseR')
Any issues, bugs or errors encountered while using the package should be reported here.
-The examples shown here will use the abr1
data set from the metaboData package (?metaboData::abr1
). This is a nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data set from a plant-pathogen infection time course experiment. The examples will also include use of the pipe %>%
from the magrittr package.
Firstly load the necessary packages:
- -The package supports parallel processing using the future package.
-By default, processing by metabolyseR
will be done sequentially. However, parallel processing can be activated, prior to analysis, by specifying a parallel back-end using plan()
. The following example specifies using the multisession
implementation (multiple background R sessions) with two worker processes.
-plan(future::multisession,workers = 2)
See the future package documentation for more information on the types of parallel implementations that are available.
-For exploratory analyses, simple questions of the data need to be answered quickly, requiring few steps. Key requirements for any tool used by investigators are that it should be both simple and flexible.
-In metabolyseR, the AnalysisData
class is the base S4 class that provides these requirements. The following sections will give an overview of the basics in constructing and using these objects as the base for analysis.
We can firstly construct an AnalysisData
object which requires two data tables. The first is the metabolomic data, where the columns are the metabolome features, the rows are the sample observations and the cells contain the abundance values. The second is the sample meta-information, where the row order should match that of the metabolome data table. Using the example data, this can be constructed and assigned to the variable d
by:
-d <- analysisData(data = abr1$neg,
- info = abr1$fact)
Where abr1$neg
is the negative ionisation mode data and abr1$fact
is the corresponding sample information. By printing d
we can view some basic information about our data.
-print(d)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 9
-We can also return the numbers of samples and numbers of features respectively using the following:
-
-nSamples(d)
## [1] 120
-
-nFeatures(d)
## [1] 2000
-The data table can be extracted using the dat
method:
-dat(d)
## # A tibble: 120 × 2,000
-## N1 N2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 7 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 8 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 9 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 10 0 0 0 0 0 0 0 0 0 0 0 0 0
-## # … with 110 more rows, and 1,987 more variables: N14 <dbl>, N15 <dbl>,
-## # N16 <dbl>, N17 <dbl>, N18 <dbl>, N19 <dbl>, N20 <dbl>, N21 <dbl>,
-## # N22 <dbl>, N23 <dbl>, N24 <dbl>, N25 <dbl>, N26 <dbl>, N27 <dbl>,
-## # N28 <dbl>, N29 <dbl>, N30 <dbl>, N31 <dbl>, N32 <dbl>, N33 <dbl>,
-## # N34 <dbl>, N35 <dbl>, N36 <dbl>, N37 <dbl>, N38 <dbl>, N39 <dbl>,
-## # N40 <dbl>, N41 <dbl>, N42 <dbl>, N43 <dbl>, N44 <dbl>, N45 <dbl>,
-## # N46 <dbl>, N47 <dbl>, N48 <dbl>, N49 <dbl>, N50 <dbl>, N51 <dbl>, …
-Or alternatively, can be used to assign a new data table:
-
-dat(d) <- abr1$pos
-d
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 9
-The sample information table can be extracted using the sinfo
method:
-sinfo(d)
## # A tibble: 120 × 9
-## injorder pathcdf filecdf name.org remark name rep day class
-## <int> <fct> <fct> <fct> <fct> <fct> <int> <fct> <int>
-## 1 1 C:/Xcalibur/ANDI-LT… 01.cdf 12_2 ok 12_2 2 2 2
-## 2 2 C:/Xcalibur/ANDI-LT… 02.cdf 13_3 ok 13_4 3 3 3
-## 3 3 C:/Xcalibur/ANDI-LT… 03.cdf 15_4 ok 15_5 5 4 4
-## 4 4 C:/Xcalibur/ANDI-LT… 04.cdf 12_1 ok 12_2 2 1 1
-## 5 5 C:/Xcalibur/ANDI-LT… 05.cdf 12_2 ok 12_2 2 2 2
-## 6 6 C:/Xcalibur/ANDI-LT… 06.cdf 11_1 ok 11_2 1 1 1
-## 7 7 C:/Xcalibur/ANDI-LT… 07.cdf 14_2 ok 14_3 4 2 2
-## 8 8 C:/Xcalibur/ANDI-LT… 08.cdf 11_4 ok 11_5 1 4 4
-## 9 9 C:/Xcalibur/ANDI-LT… 09.cdf 13_H ok 13_H 3 H 6
-## 10 10 C:/Xcalibur/ANDI-LT… 10.cdf 15_H ok 15_H 5 H 6
-## # … with 110 more rows
-And similarly used to assign a new sample information table:
-
-sinfo(d) <- abr1$fact[,1:2]
-d
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 2
-There are a number of methods that provide utility for querying and altering the sample information within an AnalysisData
object. These methods are all named with the prefix cls
and include:
clsAdd
clsArrange
clsAvailable
clsExtract
clsRemove
clsRename
clsReplace
The names of the available sample information columns can be shown using clsAvailable()
.
-clsAvailable(d)
## [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name" "rep"
-## [8] "day" "class"
-A given column can be extracted using clsExtract()
. Here, the day
column is extracted.
-clsExtract(d,cls = 'day')
## [1] 2 3 4 1 2 1 2 4 H H 4 5 1 2 H 5 3 3 2 H 4 3 5 4 H H 3 H H 1 1 1 5 5 3 4 H
-## [38] 1 5 5 1 2 4 3 2 4 3 2 5 4 4 H 3 4 2 4 4 1 5 4 4 1 1 H 3 2 H 3 3 1 2 H H 2
-## [75] 3 5 3 2 5 2 4 3 H 2 3 2 1 1 4 5 3 2 1 H 5 2 4 H 1 4 4 1 1 5 H 5 1 3 3 5 5
-## [112] 5 3 2 5 H 5 H 2 1
-## Levels: 1 2 3 4 5 H
-Sample class frequencies could then be computed.
-
-clsExtract(d,cls = 'day') %>%
- table()
## .
-## 1 2 3 4 5 H
-## 20 20 20 20 20 20
-It can be seen that there are 20 samples available in each class.
-Another example is the addition of a new sample information column. In the following, a column called new_class
will be added with all samples labelled 1
.
-d <- clsAdd(d,cls = 'new_class',value = rep(1,nSamples(d)))
-clsAvailable(d)
## [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name"
-## [7] "rep" "day" "class" "new_class"
-Samples or features can easily be kept or removed from an AnalysisData
object as is most convenient.
Below can be seen the first 6 sample indexes in the injorder
column of the sample information.
-samples <- d %>%
- clsExtract(cls = 'injorder') %>%
- head()
-
-print(samples)
## [1] 1 2 3 4 5 6
-Only these samples could be kept using:
-
-d %>%
- keepSamples(idx = 'injorder',samples = samples)
##
-## AnalysisData object containing:
-##
-## Samples: 6
-## Features: 2000
-## Info: 10
-Or removed using:
-
-d %>%
- removeSamples(idx = 'injorder',samples = samples)
##
-## AnalysisData object containing:
-##
-## Samples: 114
-## Features: 2000
-## Info: 10
-The process is very similar for keeping or removing specific metabolome features from the data table. Below can be seen the first 6 feature names in the data table.
- -## [1] "N1" "N2" "N3" "N4" "N5" "N6"
-Only these features can be kept using:
-
-d %>%
- keepFeatures(features = feat)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 6
-## Info: 10
-Or to remove these features:
-
-d %>%
- removeFeatures(features = feat)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 1994
-## Info: 10
-Routine analyses are those that are often made up of numerous steps where parameters have likely already been previously established. The emphasis here is on convenience with as little code as possible required. In these analyses, the necessary analysis elements, order and parameters are first prepared and then the analysis routine subsequently performed in a single step. This section will introduce how this type of analysis can be performed using metabolyseR and will include four main topics:
-Parameter selection is the fundamental aspect for performing routine analyses using metabolyseR and will be the step requiring the most input from the user. The parameters for an analysis are stored in an S4 object of class AnalysisParameters
containing the relevant parameters of the selected analysis elements.
The parameters have been named so that they consistently denote the same functionality across all analysis element methods. Discussion of the specific parameters can be found within the vignettes of the relevant analysis elements. These can be accessed using:
-
-browseVignettes('metabolyseR')
-There are several ways to specify the parameters to use for analysis. The first is programmatically and the second is through the use of the YAML format.
-The available analysis elements can be shown using:
- -## [1] "pre-treatment" "modelling" "correlations"
-The analysisParameters()
function can be used to create an AnalysisParameters
object containing the default parameters. For example, the code below will return default parameters for all the metabolyseR analysis elements.
-p <- analysisParameters()
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = class
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = class
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = class
-## occupancy = 2/3
-## impute
-## class
-## cls = class
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = class
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-##
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-To retrieve parameters for a subset of analysis elements the following can be run, returning parameters for only the pre-treatment and modelling elements.
-
-p <- analysisParameters(c('pre-treatment','modelling'))
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = class
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = class
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = class
-## occupancy = 2/3
-## impute
-## class
-## cls = class
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = class
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-The changeParameter()
function can be used to uniformly change these parameters across all of the selected methods. The example below changes the defaults of all the parameters named cls
from the default class
to day
.
-p <- analysisParameters()
-changeParameter(p,'cls') <- 'day'
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = day
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = day
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = day
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = day
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = day
-## occupancy = 2/3
-## impute
-## class
-## cls = day
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = day
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-##
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-Alternatively, the parameters of specific analysis elements can be targeted using the elements
argument. The following will only alter the cls
parameter back to class
for the pre-treatment element parameters:
-changeParameter(p,'cls',elements = 'pre-treatment') <- 'class'
Parameters can be extracted from the AnalysisParameters
class using the parameters()
function for a specified element.
-parameters(p,'correlations')
## $method
-## [1] "pearson"
-##
-## $pAdjustMethod
-## [1] "bonferroni"
-##
-## $corPvalue
-## [1] 0.05
-Each analysis element has a function for returning default parameters for specific methods. These include preTreatmentParameters()
, modellingParameters()
and correlationParameters()
. Each returns a list of the default parameters for the specified methods, as shown in the example for modellingParameters()
below.
-modellingParameters('anova')
## $anova
-## $anova$cls
-## [1] "class"
-##
-## $anova$pAdjust
-## [1] "bonferroni"
-##
-## $anova$comparisons
-## list()
-##
-## $anova$returnModels
-## [1] FALSE
-Refer to the documentation (?
) of each function for specific usage details.
The parameters returned by these functions can be assigned to an AnalysisParameters
object, again using parameters()
’
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- occupancyFilter = 'maximum',
- transform = 'TICnorm'
- )
- )
Due to the relatively complex structure of the parameters needed for analyses containing many components, it is also possible to specify analysis parameters using the YAML file format. YAML parameter files (.yaml) can be parsed using the parseParameters()
function. The example below shows the YAML specification for the defaults returned by analysisParameters()
.
pre-treatment:
- QC:
- occupancyFilter:
- cls: class
- QCidx: QC
- occupancy: 0.667
- impute:
- cls: class
- QCidx: QC
- occupancy: 0.667
- RSDfilter:
- cls: class
- QCidx: QC
- RSDthresh: 0.5
- removeQC:
- cls: class
- QCidx: QC
- occupancyFilter:
- maximum:
- cls: class
- occupancy: 0.667
- impute:
- class:
- cls: class
- occupancy: 0.667
- nCores: 4
- clusterType: FORK
- transform:
- TICnorm: ~
-classification:
- cls: class
- method: randomForest
- pars:
- sampling: boot
- niter: 10
- nreps: 10
- strat: yes
- nCores: 4
- clusterType: Fork
-featureSelection:
- method: fs.rf
- cls: class
- pars:
- fs.rf:
- nreps: 100
- nCores: 4
- clusterType: FORK
-correlations:
- method: pearson
- pAdjustMethod: bonferroni
- corPvalue: 0.05
This can be passed directly into an AnalysisParameters
object using the following:
-paramFile <- system.file('defaultParameters.yaml',package = 'metabolyseR')
-p <- parseParameters(paramFile)
For more complex pre-treatment situations such as the following:
-pre-treatment:
- remove:
- sample:
- idx: fileOrder
- samples: 1
- remove1:
- class:
- cls: day
- classes:
- - H
- - 1
- occupancyFilter:
- maximum:
- cls: class
- occupancy: 0.667
- transform:
- TICnorm: ~
Where multiple steps of the same method are needed (here this is remove
), these are numbered sequentially. Where multiple values also need to be provided to a particular argument (e.g. classes = c('H','1')
), these should be supplied as a hyphenated list.
Existing AnalysisParameters
objects can also be exported to YAML format as shown below:
-p <- analysisParameters()
-exportParameters(p,file = 'analysis_parameters.yaml')
The analysis is performed in a single step using the metabolyse()
function. This accepts the metabolomic data, the sample information and the analysis parameters.
The metabolomic data should be a table of abundance values where the columns are the metabolome features and the rows are each sample observation. Similarly, the sample meta-information table should consist of the observations as rows and the meta information as columns. The order of the observation rows of the sample information table should be concordant with the rows in the metabolomics data table.
-We can run an example analysis using the abr1
data set by first generating the default parameters for pre-treatment and modelling (random forest) analysis elements.
-p <- analysisParameters(c('pre-treatment','modelling'))
Custom pre-treatment parameters can then be specified to only include occupancy filtering and total ion count normalisation.
-
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- occupancyFilter = 'maximum',
- transform = 'TICnorm')
-)
Next the cls
parameters can be changed to use the day
sample information column throughout the analysis.
-changeParameter(p,'cls') <- 'day'
Finally, the analysis can be run in a single step. Here only the first 200 features of the negative ionisation mode data are specified to reduce the analysis time needed for this example.
-
-analysis <- metabolyse(abr1$neg[,1:200],abr1$fact,p)
##
-## metabolyseR v0.14.3 Tue Sep 14 10:08:22 2021
-## ________________________________________________________________________________
-## Parameters:
-## pre-treatment
-## occupancyFilter
-## maximum
-## cls = day
-## occupancy = 2/3
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = day
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-## ________________________________________________________________________________
-## Pre-treatment …
-
-Pre-treatment ✓ [0.9S]
-## Modelling …
-
-Modelling ✓ [3.7S]
-## ________________________________________________________________________________
-##
-## Complete! [4.6S]
-Note: If a data pre-treatment step is not performed prior to modelling or correlation analysis, the raw data will automatically be used.
-The analysis
object containing the analysis results can be printed to provide some basic information about the results of the analysis.
-print(analysis)
##
-## metabolyseR v0.14.3
-## Analysis:
-## Tue Sep 14 10:08:22 2021
-##
-## Raw Data:
-## No. samples = 120
-## No. features = 200
-##
-## Pre-treated Data:
-## Tue Sep 14 10:08:22 2021
-## No. samples = 120
-## No. features = 48
-##
-## Modelling:
-## Tue Sep 14 10:08:26 2021
-## Methods: randomForest
-There are likely to be occasions where an analysis will need to be re-analysed using a new set of parameters. This can be achieved using the reAnalyse()
function.
In the example below we will run a correlation analysis in addition to the pre-treatment and modelling elements already performed.
-Firstly, we can specify the correlation parameters:
-
-parameters <- analysisParameters('correlations')
Then perform the re-analysis on our previously analysed Analysis
object, specifying the additional parameters.
-analysis <- reAnalyse(analysis,parameters)
##
-## metabolyseR v0.14.3 Tue Sep 14 10:08:26 2021
-## ________________________________________________________________________________
-## Parameters:
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-## ________________________________________________________________________________
-## Correlations …
-
-Correlations ✓ [0.1S]
-## ________________________________________________________________________________
-##
-## Complete! [0.1S]
-An overview of the results of the analysis (now including correlations) can then be printed.
-
-print(analysis)
##
-## metabolyseR v0.14.3
-## Analysis:
-## Tue Sep 14 10:08:22 2021
-##
-## Raw Data:
-## No. samples = 120
-## No. features = 200
-##
-## Pre-treated Data:
-## Tue Sep 14 10:08:22 2021
-## No. samples = 120
-## No. features = 48
-##
-## Modelling:
-## Tue Sep 14 10:08:26 2021
-## Methods: randomForest
-##
-## Correlations:
-## Tue Sep 14 10:08:27 2021
-## No. correlations = 140
-An analysis performed by metabolyse()
returns an S4 object of class Analysis
. There are a number of ways of extracting analysis results from this object.
Similarly to the AnalysisData
class, the dat()
and sinfo()
functions can be used to extract the metabolomics data or sample information tables directly for either the raw
or pre-treated
data.
For example, to extract the pre-treated metabolomics data from our object analysis
:
-dat(analysis,type = 'pre-treated')
## # A tibble: 120 × 48
-## N113 N115 N117 N118 N119 N127 N128 N129 N130 N131
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 0.00646 0 1.68e-4 0 1.60e-3 0.0323 2.65e-4 2.80e-4 0 0
-## 2 0.0113 7.74e-4 1.02e-3 0 1.43e-3 0.00856 0 3.95e-4 0 0
-## 3 0.00931 6.01e-4 2.70e-3 6.22e-5 5.58e-3 0 0 1.05e-4 0 6.51e-4
-## 4 0.00798 0 0 0 1.62e-4 0.00848 0 4.05e-4 0 1.28e-4
-## 5 0.0105 0 0 0 0 0.00658 0 1.97e-3 0 0
-## 6 0.00454 0 2.48e-4 3.25e-4 5.31e-4 0.00207 0 1.98e-4 0 0
-## 7 0.0117 0 1.14e-3 0 4.39e-4 0.00603 0 4.04e-4 0 0
-## 8 0.00787 2.36e-3 1.43e-3 1.52e-4 4.22e-3 0.00290 2.78e-4 5.76e-5 0 0
-## 9 0.00136 1.87e-4 8.17e-4 1.87e-4 0 0.0610 1.31e-4 5.23e-4 0 0
-## 10 0.00899 4.26e-4 2.06e-3 0 8.36e-4 0.00106 7.72e-4 0 0 0
-## # … with 110 more rows, and 38 more variables: N132 <dbl>, N133 <dbl>,
-## # N134 <dbl>, N135 <dbl>, N136 <dbl>, N137 <dbl>, N139 <dbl>, N143 <dbl>,
-## # N145 <dbl>, N146 <dbl>, N147 <dbl>, N149 <dbl>, N153 <dbl>, N155 <dbl>,
-## # N157 <dbl>, N161 <dbl>, N163 <dbl>, N164 <dbl>, N165 <dbl>, N168 <dbl>,
-## # N169 <dbl>, N170 <dbl>, N171 <dbl>, N173 <dbl>, N174 <dbl>, N175 <dbl>,
-## # N179 <dbl>, N180 <dbl>, N181 <dbl>, N183 <dbl>, N187 <dbl>, N191 <dbl>,
-## # N192 <dbl>, N193 <dbl>, N195 <dbl>, N196 <dbl>, N197 <dbl>, N198 <dbl>
-Or to extract the raw sample information:
-
-sinfo(analysis,type = 'raw')
## # A tibble: 120 × 9
-## injorder pathcdf filecdf name.org remark name rep day class
-## <int> <fct> <fct> <fct> <fct> <fct> <int> <fct> <int>
-## 1 1 C:/Xcalibur/ANDI-LT… 01.cdf 12_2 ok 12_2 2 2 2
-## 2 2 C:/Xcalibur/ANDI-LT… 02.cdf 13_3 ok 13_4 3 3 3
-## 3 3 C:/Xcalibur/ANDI-LT… 03.cdf 15_4 ok 15_5 5 4 4
-## 4 4 C:/Xcalibur/ANDI-LT… 04.cdf 12_1 ok 12_2 2 1 1
-## 5 5 C:/Xcalibur/ANDI-LT… 05.cdf 12_2 ok 12_2 2 2 2
-## 6 6 C:/Xcalibur/ANDI-LT… 06.cdf 11_1 ok 11_2 1 1 1
-## 7 7 C:/Xcalibur/ANDI-LT… 07.cdf 14_2 ok 14_3 4 2 2
-## 8 8 C:/Xcalibur/ANDI-LT… 08.cdf 11_4 ok 11_5 1 4 4
-## 9 9 C:/Xcalibur/ANDI-LT… 09.cdf 13_H ok 13_H 3 H 6
-## 10 10 C:/Xcalibur/ANDI-LT… 10.cdf 15_H ok 15_H 5 H 6
-## # … with 110 more rows
-Alternatively the raw
or preTreated
functions can be used to extract the AnalysisData
class objects containing both the metabolomics data and sample information for the raw and pre-treated data respectively.
-raw(analysis)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 200
-## Info: 9
-
-preTreated(analysis)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 48
-## Info: 9
-Lastly the analysisResults
function can be used to extract the results of any of the analysis elements. The following will extract the modelling results:
-analysisResults(analysis,element = 'modelling')
## $randomForest
-##
-## Random forest classification
-##
-## Samples: 120
-## Features: 48
-## Response: day
-## # comparisons: 1
-vignettes/03_pre_treatment.Rmd
- 03_pre_treatment.Rmd
Metabolomics data from any analytical technique requires various data pre-treatment steps prior to subsequent data mining or other downstream analyses. This aids both the data quality and integrity. It is important that appropriate pre-treatment strategies are used not only for the analytical technique being applied but are also suitable for the statistical or machine learning analyses that are to be utilised. Careful consideration of the pre-treatment steps to be undertaken are required as they can have a substantial influence on the results and inferences taken from metabolomic analyses.
-Data pre-treatment is the most faceted aspect of the analysis elements in metabolyseR. It is itself made up of a number of elements, which themselves are made up of methods. The following document will outline the application of each of these pre-treatment elements for use in exploratory analyses then outline how to apply them in routine analyses. For an introduction to the usage of metabolyseR for both exploratory and routine analyses, see the introduction vignette using:
-
-vignette('introduction','metabolyseR')
To further supplement this document, a quick start example analysis is also available as a vignette:
-
-vignette('quick_start','metabolyseR')
To begin, the package can be loaded using:
-
-library(metabolyseR)
-#>
-#> Attaching package: 'metabolyseR'
-#> The following object is masked from 'package:stats':
-#>
-#> anova
-#> The following objects are masked from 'package:base':
-#>
-#> raw, split
The examples used here will use the abr1
data set from the metaboData package. This is nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data from a plant-pathogen infection time course experiment. The pipe %>%
from the magrittr package will also be used. The example data can be loaded using:
Only the negative acquisition mode data (abr1$neg
) will be used along with the sample meta-information (abr1$fact
). Create an AnalysisData
class object, assigned to the variable d
, using the following:
-d <- analysisData(abr1$neg,abr1$fact)
-print(d)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2000
-#> Info: 9
As can be seen above the data set contains a total of 120 samples and 2000 features.
-The package supports parallel processing using the future package.
-By default, processing by metabolyseR
will be done sequentially. However, parallel processing can be activated, prior to analysis, by specifying a parallel implementation using plan()
. The following example specifies using the multisession
implementation (multiple background R sessions) with two worker processes.
-plan(future::multisession,workers = 2)
See the future package documentation for more information on the types of parallel implementations that are available.
-The following sections will outline the numerous pre-treatment elements available within metabolyseR. There will be examples of their application during exploratory analyses along with useful visualisations. These can aid interpretation of when particular treatments should be applied as well as their effect once they have been used.
-In many situations, it will be necessary to exclude either individual samples, sample classes or certain features from further analysis.
-Individual samples can be removed using removeSamples()
as below, where the idx
argument stipulates the sample information column containing the sample indexes and the samples
argument a vector of sample indexes to remove.
-d %>%
- removeSamples(idx = 'injorder',samples = 1)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 119
-#> Features: 2000
-#> Info: 9
The removeClasses
function can be used similarly to remove whole classes from further analysis:
-d %>%
- removeClasses(cls = 'day',classes = 'H')
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 100
-#> Features: 2000
-#> Info: 9
The following will enable the removal of specified features as a vector supplied to the features
argument:
-d %>%
- removeFeatures(features = c('N1','N2'))
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1998
-#> Info: 9
There could be occasions where the numbers of samples, classes or features to remove are greater than the numbers of samples, classes or features that are to be retained. In these situations it will be more convenient to directly specify the samples, classes or features to retain. Keeping samples, classes or features is outlined in the following section.
-Often it will be necessary to retain only particular samples, sample classes or certain features for further analysis.
-Individual samples can be kept using keepSamples()
as below, where the idx
argument stipulates the sample information column containing the sample indexes and the samples
argument, a vector of sample indexes to keep.
-d %>%
- keepSamples(idx = 'injorder',samples = 1)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 1
-#> Features: 2000
-#> Info: 9
The keepClasses()
method can be used similarly to keep whole classes for further analysis:
-d %>%
- keepClasses(cls = 'day',classes = 'H')
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 2000
-#> Info: 9
The following will specify features to keep, with a vector of feature names supplied to the features
argument:
-d %>%
- keepFeatures(features = c('N1','N2'))
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2
-#> Info: 9
There are likely to be occasions where the numbers of samples, classes or features to keep are greater than the numbers of samples, classes or features that are to be excluded. In these situations it will be more convenient to directly specify the samples, classes or features to remove. Removing samples, classes or features is outlined in the previous section.
-Occupancy provides a useful metric by which to filter poorly represented features (features containing a majority zero or missing values). An occupancy threshold provides a means of specifying this majority with variables below the threshold excluded from further analyses. However, this can be complicated by an underlying class structure present within the data where a variable may be well represented within one class but not in another.
-The proportional occupancy for each feature within a data set for a given class structure can be calculated using the occupancy()
method, specifying the sample information column using the cls
argument.
-d %>%
- occupancy(cls = 'day')
-#> # A tibble: 11,914 × 5
-#> day Feature N `Class total` Occupancy
-#> <fct> <chr> <dbl> <int> <dbl>
-#> 1 1 N1 0 20 0
-#> 2 1 N10 0 20 0
-#> 3 1 N100 0 20 0
-#> 4 1 N1000 20 20 1
-#> 5 1 N1001 20 20 1
-#> 6 1 N1002 20 20 1
-#> 7 1 N1003 20 20 1
-#> 8 1 N1004 20 20 1
-#> 9 1 N1005 20 20 1
-#> 10 1 N1006 20 20 1
-#> # … with 11,904 more rows
Alternatively the occupancy distributions can be plotted providing a useful overview of the data set:
-
-d %>%
- plotOccupancy(cls = 'day')
It can be seen that there are a number of unoccupied features across all the sample classes with a small rise in the density distribution near 0.
-There are two strategies for thresholding occupancy. The first is a maximum threshold, where the maximum occupancy across all classes is above the threshold. Therefore, for a feature to be retained, only a single class needs to have an occupancy above the threshold. It is this strategy that will be appropriate for most applications. A two-thirds maximum occupancy filter can be applied to the day
sample information column of our data using:
-maximum_occupancy_filtered <- d %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3)
It can be seen below that this removes 240 features.
-
-print(maximum_occupancy_filtered)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1760
-#> Info: 9
Plotting the occupancy distributions shows that all the low occupancy features have now been removed.
-
-maximum_occupancy_filtered %>%
- plotOccupancy(cls = 'day')
The alternative strategy is by applying a minimum threshold; where the minimum occupancy across all classes is required to be above the threshold. Therefore, for a feature to be retained, all classes would need to have an occupancy above the threshold. A two-thirds minimum occupancy filter can be applied to the day
sample information column of our data using:
-minimum_occupancy_filtered <- d %>%
- occupancyMinimum(cls = 'day',occupancy = 2/3)
It can be seen below that this removes 344 features.
-
-print(minimum_occupancy_filtered)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1656
-#> Info: 9
Prior to downstream analyses, metabolomics data often require transformation to fulfill the assumptions of a particular statistical/data mining technique.
-There are a wide range of transformation methods available that are commonly used for the analysis of metabolomics data. These methods are all named with the prefix transform
.
The effects of a transformation on a data set can be assessed using a supervised classification approach. The following performs a supervised random forest analysis of the example data and plots the results using both multidimensional scaling (MDS) and receiver operating characteristic (ROC) curves.
-
-d %>%
- plotSupervisedRF(cls = 'day')
Alternatively a log10 transformation can be applied prior to analysis:
-
-d %>%
- transformLog10() %>%
- plotSupervisedRF(cls = 'day')
Or a total ion count (TIC) normalisation where each individual sample is corrected by its TIC. This is one method that can be used to account for small variability in sample concentration.
-
-d %>%
- transformTICnorm() %>%
- plotSupervisedRF(cls = 'day')
The margin value is a metric that can be used to assess model performance. Positive values indicate a model's ability, on average, to correctly predict the class labels of the analysed data.
-As can be seen in the plots above, the transformations have little effect on the overall structure of the data set. However, there are small increases in the margins of the transformed data (model improvement). Note that here, a non-parametric machine learning approach has been applied to assess the effects of the transformations on the data. Using a different approach, such as the parametric analysis of variance (ANOVA), which has different underlying assumptions, will likely give different results to the assessment above.
-Sample aggregation allows the electronic pooling of samples based on a grouping variable. This is useful in situations such as the presence of technical replicates that can be aggregated to reduce the effects of pseudo replication. metabolyseR
provides methods for mean, median and sum aggregation and each starts with the aggregate
prefix.
Below shows a principal component analysis (PCA) plot of the example data coloured by the classes of the day
sample information column. It is first maximum occupancy filtered to remove empty features.
-d %>%
- occupancyMaximum(cls = 'day') %>%
- plotPCA(cls = 'day')
The example below shows the mean aggregation of the data using the experimental classes within the day
sample information column.
-day_mean <- d %>%
- occupancyMaximum(cls = 'day') %>%
- aggregateMean(cls = 'day')
The PCA plot below shows these class averages of the data.
-
-plotPCA(day_mean,cls = 'day',ellipses = FALSE)
There can sometimes be artificial batch related variability introduced into metabolomics analyses as a result of analytical instrumentation or sample preparation. With appropriate sample randomisation (see section on feature filtering based on QC samples), batch related variability can be corrected for using an average centring correction method, applied to the individual features.
-The plot below shows differences in the TIC distributions for each of the classes in the day
sample information column.
-d %>%
- plotTIC(by = 'day',colour = 'day')
The data can then be corrected by class average centring as shown below.
-
-corrected_data <- d %>%
- correctionCenter(block = 'day',type = 'median')
The plot of the TICs below shows that the inter-class variability has been removed but the intra-class variability has been retained.
-
-plotTIC(corrected_data,
- by = 'day',
- colour = 'day')
Missing values can have an important influence on downstream analyses with zero values heavily influencing the outcomes of parametric tests. Where and how they are imputed are important considerations and this is highly related to variable occupancy. The methods provided here allow both these aspects to be taken into account and utilise Random Forest imputation using the missForest package.
-Below shows a Linear Discriminant Analysis (LDA) plot of the example data. The eigenvalue (Tw) gives a comparable indication of the separation between the sample classes.
-
-d %>%
- keepClasses(cls = 'day',classes = c('H','5')) %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3) %>%
- plotLDA(cls = 'day')
The following shows the same, except there is an application of imputation prior to the LDA. The imputed data is based on the data of all the samples present on the data set. It shows a very slight drop in the eigenvalue and therefore reduced separation between the sample classes.
-
-d %>%
- keepClasses(cls = 'day',classes = c('H','5')) %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3) %>%
- imputeAll(parallel = 'variables') %>%
- plotLDA(cls = 'day')
Imputation accuracy is likely to be reduced if data is sparse or there is underlying class structure where there is significant discrimination. Below shows the application of imputation prior to the LDA, except this time the imputation is class-wise. The imputed data is based only on the values of other samples within the class.
-
-d %>%
- keepClasses(cls = 'day',classes = c('H','5')) %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3) %>%
- imputeClass(cls = 'day') %>%
- plotLDA(cls = 'day')
This shows a slight increase in the eigenvalue with the classes showing greater separation. This is likely due to the increased accuracy of the imputed data relative to the class structure.
-A QC sample is an average pooled sample, equally representative in composition of all the samples present within an experimental set. Within an analytical run, the QC sample is analysed at equal intervals throughout the run. If there is class structure within the run, this should be randomised within a block fashion so that the classes are equally represented in each block throughout the run. A QC sample can then be injected and analysed between these randomised blocks. This provides a set of technical injections that allows the variability in instrument performance over the run to be accounted for and the robustness of the acquired variables to be assessed.
-The technical reproducibility of an acquired variable can be assessed using its relative standard deviation (RSD) within the QC samples. The variable RSDs can then be filtered below a threshold value to remove metabolome features that are poorly reproducible across the analytical runs. This variable filtering strategy has an advantage over that of occupancy alone as it is not dependent on underlying class structure. Therefore, the variables and variable numbers will not alter if a new class structure is imposed upon the data.
-The example data set does not include QC samples. For this example, the H
class will be used.
Firstly, the RSD distribution will be assessed for only the H
class. The following retains only the H
class samples to aid visualisation.
-QC <- d %>%
- keepClasses(cls = 'day',classes = 'H')
The table of RSD values for each of the features can be computed as below.
-
-QC %>%
- rsd(cls = 'day')
-#> # A tibble: 2,000 × 5
-#> day Feature Mean SD RSD
-#> <fct> <chr> <dbl> <dbl> <dbl>
-#> 1 H N1 0 0 NaN
-#> 2 H N10 0 0 NaN
-#> 3 H N100 0 0 NaN
-#> 4 H N1000 114. 19.4 17.0
-#> 5 H N1001 99.2 21.6 21.7
-#> 6 H N1002 86.7 23.9 27.6
-#> 7 H N1003 82.3 18.0 21.9
-#> 8 H N1004 91.6 18.8 20.5
-#> 9 H N1005 78.2 14.0 17.9
-#> 10 H N1006 78.6 21.3 27.1
-#> # … with 1,990 more rows
The distributions of the feature RSD values can be plotted for the H
class.
-QC %>%
- plotRSD(cls = 'day')
-#> Warning: Removed 123 rows containing non-finite values (stat_density).
-#> Warning: Removed 1 row(s) containing missing values (geom_path).
This shows that there are a number of features with very high RSD values and therefore poor analytical robustness. Many of these are likely to be as a result of poor occupancy and zero values. Applying an occupancy filter prior to plotting does indeed show a reduction in the upper range of RSD values retained.
-
-QC %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3) %>%
- plotRSD(cls = 'day')
metabolyseR
contains a number of methods for applying pre-treatment routines specifically on QC samples and are all prefixed with QC
. These include methods for feature filtering of a data set based on the occupancy of the QC class, imputation of the QC class only, feature filtering based on the RSD values of the QC class and removal of only the QC class.
Below shows an example of applying some of these QC methods. This will first filter the features in the data set based on the occupancy of the QC class. Then the features are filtered based on the RSD values of the QC class using an RSD threshold of 50%. The class index of the QC samples is specified using the QCidx
argument.
-QC_filtered <- d %>%
- QCoccupancy(cls = 'day',QCidx = 'H',occupancy = 2/3) %>%
- QCrsdFilter(cls = 'day',QCidx = 'H',RSDthresh = 50)
This removes a total of 637 features.
-
-print(QC_filtered)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1363
-#> Info: 9
For routine analyses, the available pre-treatment elements can be retrieved using:
-
-preTreatmentElements()
-#> [1] "aggregate" "correction" "impute" "keep"
-#> [5] "occupancyFilter" "QC" "remove" "transform"
The available methods for a specified pre-treatment element can be viewed using:
-
-preTreatmentMethods('remove')
-#> [1] "classes" "features" "samples"
The default pre-treatment parameters can first be assigned to the variable p
.
-p <- analysisParameters('pre-treatment')
The preTreatmentParameters()
function allows the parameters for particular pre-treatment elements to be specified. The following specifies the pre-treatment elements that will be used for this data set. These will include the keeping of certain sample classes, the filtering of features based on class occupancy and the application of a TIC normalisation. These will be assigned to the p
variable using the parameters()
method.
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- keep = 'classes',
- occupancyFilter = 'maximum',
- transform = 'TICnorm'
- )
-)
Printing p
shows these pre-treatment steps.
-print(p)
-#> Parameters:
-#> pre-treatment
-#> keep
-#> classes
-#> cls = class
-#> classes = c()
-#> occupancyFilter
-#> maximum
-#> cls = class
-#> occupancy = 2/3
-#> transform
-#> TICnorm
Next, the day
sample information column can be specified, along with the classes to be kept which will be the H
, the 1
and the 2
classes.
-changeParameter(p,'cls') <- 'day'
-changeParameter(p,'classes') <- c('H','1','2')
Printing p
shows the final pre-treatment parameters that will be used for this analysis.
-print(p)
-#> Parameters:
-#> pre-treatment
-#> keep
-#> classes
-#> cls = day
-#> classes = c("H", "1", "2")
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
The pre-treatment routine can then be executed.
-<- metabolyse(abr1$neg,abr1$fact,p)
- analysis #>
-#> metabolyseR v0.14.3 Tue Sep 14 10:12:19 2021
-#> ________________________________________________________________________________
-#> Parameters:
-#> pre-treatment
-#> keep
-#> classes
-#> cls = day
-#> classes = c("H", "1", "2")
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
-#> ________________________________________________________________________________
-#> Pre-treatment …
-
--treatment ✓ [9.1S]
- Pre#> ________________________________________________________________________________
-#>
-#> Complete! [9.1S]
Printing the analysis
object shows the resulting data from the pre-treatment routine.
-print(analysis)
-#>
-#> metabolyseR v0.14.3
-#> Analysis:
-#> Tue Sep 14 10:12:19 2021
-#>
-#> Raw Data:
-#> No. samples = 120
-#> No. features = 2000
-#>
-#> Pre-treated Data:
-#> Tue Sep 14 10:12:28 2021
-#> No. samples = 60
-#> No. features = 1723
The pre-treated data can be extracted from the Analysis
object using several methods.
Firstly the analysisResults()
method.
-analysisResults(analysis,'pre-treatment')
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 60
-#> Features: 1723
-#> Info: 9
And secondly the preTreated()
method.
-preTreated(analysis)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 60
-#> Features: 1723
-#> Info: 9
A supervised random forest analysis can be used to visualise the structure of the resulting pre-treated data.
-
-analysis %>%
- plotSupervisedRF(cls = 'day',type = 'pre-treated')
vignettes/04_modelling.Rmd
- 04_modelling.Rmd
Modelling provides the essential data mining step for extracting biological information and explanatory metabolome features from a data set relating to the experimental conditions. metabolyseR
provides a number of both univariate and multivariate methods for data mining.
For an introduction to the usage of metabolyseR for both exploratory and routine analyses, see the introduction vignette using:
-
-vignette('introduction','metabolyseR')
To further supplement this document, a quick start example analysis is also available as a vignette:
-
-vignette('quick_start','metabolyseR')
To begin, the package can be loaded using:
-
-library(metabolyseR)
-#>
-#> Attaching package: 'metabolyseR'
-#> The following object is masked from 'package:stats':
-#>
-#> anova
-#> The following objects are masked from 'package:base':
-#>
-#> raw, split
The examples used here will use the abr1
data set from the metaboData package. This is nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data from a plant-pathogen infection time course experiment. The pipe %>%
from the magrittr package will also be used. The example data can be loaded using:
Only the negative acquisition mode data (abr1$neg
) will be used along with the sample meta-information (abr1$fact
). Create an AnalysisData
class object, assigned to the variable d
, using the following:
-d <- analysisData(abr1$neg[,1:500],abr1$fact)
-print(d)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 500
-#> Info: 9
As can be seen above the data set contains a total of 120 samples and 500 features.
-The package supports parallel processing using the future package.
-By default, processing by metabolyseR
will be done sequentially. However, parallel processing can be activated, prior to analysis, by specifying a parallel implementation using plan()
. The following example specifies using the multisession
implementation (multiple background R sessions) with two worker processes.
-plan(future::multisession,workers = 2)
See the future package documentation for more information on the types of parallel implementations that are available.
-Random forest is a versatile ensemble machine learning approach based on forests of decision trees for multivariate data mining. This can include unsupervised analysis, classification of discrete response variables and regression of continuous responses.
-Random forest can be performed in metabolyseR
using the randomForest()
method. For further details on the arguments for using this function, see ?randomForest
. This implementation of random forest in metabolyseR
utilises the randomForest
package. See ?randomForest::randomForest
for more information about that implementation.
The unsupervised random forest approach can be a useful starting point for analysis in any experimental context. It can be used to give a general overview of the structure of the data and to identify any possible problems. These could include situations such as the presence of outlier samples or splits in the data caused by the impact of analytical or sample preparation factors. Unsupervised random forest can have advantages in these assessments over other approaches such as Principal Component Analysis (PCA). It is less sensitive to the effect of a single feature that in fact could have little overall impact relative to the other hundreds that could be present in a data set.
-The examples below will show the use of unsupervised random forest for assessing the general structure of the example data set and the presence of outlier samples.
-Unsupervised random forest can be performed by setting the cls
argument of randomForest()
to NULL
:
-unsupervised_rf <- d %>%
- randomForest(cls = NULL)
The type of random forest that has been performed can be checked using the type
method.
-type(unsupervised_rf)
-#> [1] "unsupervised"
Or by printing the results object.
-
-unsupervised_rf
-#>
-#> Unsupervised random forest
-#>
-#> Samples: 120
-#> Features: 500
Firstly, the presence of outlier samples will be assessed. A multidimensional scaling (MDS) plot can be used to visualise the relative proximity of the observations, as shown in the following. The individual points are also labelled by their injection order to enable the identification of individual samples if necessary.
-
-plotMDS(unsupervised_rf,
- cls = NULL,
- label = 'injorder',
- labelSize = 3,
- title = 'Outlier detection')
-#> Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
-#> increasing max.overlaps
From the plot above, it can be seen that a single sample lies outside the 95% confidence ellipse. It is unlikely that this sample can be considered an outlier as its position is a result of the underlying class structure as opposed to differences specific to that individual sample.
-The structure of these observations can be investigated further by colouring the points by a different experimental factor. This will be by the day
class column which is the main experimental factor of interest in this experiment.
-plotMDS(unsupervised_rf,
- cls = 'day')
This shows that it is indeed the experimental factor of interest that is having the greatest impact on the structure of the data. The progression of the experimental time points is obvious across Dimension 1.
-The available feature importance metrics for a random forest analysis can be retrieved by:
-
-importanceMetrics(unsupervised_rf)
-#> [1] "1" "2" "FalsePositiveRate"
-#> [4] "MeanDecreaseAccuracy" "MeanDecreaseGini" "SelectionFrequency"
And the importance values of these metrics for each feature can be returned using:
-
-importance(unsupervised_rf)
-#> # A tibble: 3,000 × 3
-#> Feature Metric Value
-#> <chr> <chr> <dbl>
-#> 1 N1 1 0
-#> 2 N1 2 0
-#> 3 N1 FalsePositiveRate 0.0238
-#> 4 N1 MeanDecreaseAccuracy 0
-#> 5 N1 MeanDecreaseGini 0
-#> 6 N1 SelectionFrequency 0
-#> 7 N10 1 0
-#> 8 N10 2 0
-#> 9 N10 FalsePositiveRate 0.0238
-#> 10 N10 MeanDecreaseAccuracy 0
-#> # … with 2,990 more rows
The explanatory features for a given threshold can be extracted for any of the importance metrics. The following will extract the explanatory features below a threshold of 0.05 based on the false positive rate metric.
-
-unsupervised_rf %>%
- explanatoryFeatures(metric = "FalsePositiveRate",
- threshold = 0.05)
-#> # A tibble: 359 × 3
-#> Feature Metric Value
-#> <chr> <chr> <dbl>
-#> 1 N342 FalsePositiveRate 1.31e-19
-#> 2 N161 FalsePositiveRate 2.34e-16
-#> 3 N341 FalsePositiveRate 6.50e-16
-#> 4 N315 FalsePositiveRate 1.79e-15
-#> 5 N367 FalsePositiveRate 3.47e-14
-#> 6 N173 FalsePositiveRate 9.09e-14
-#> 7 N385 FalsePositiveRate 9.09e-14
-#> 8 N133 FalsePositiveRate 1.52e-12
-#> 9 N439 FalsePositiveRate 1.52e-12
-#> 10 N379 FalsePositiveRate 3.78e-12
-#> # … with 349 more rows
In this example there are 359 explanatory features.
-The trend of the most highly ranked explanatory feature against the day
factor can be plotted using the plotFeature()
method.
-unsupervised_rf %>%
- plotFeature(feature = 'N425',
- cls = 'day')
Random forest classification can be used to assess the extent of discrimination (difference) between classes of a discrete response variable. This includes both multinomial (number of classes > 2) and binary (number of classes = 2) comparisons.
-In multinomial situations, the suitability of a multinomial comparison versus multiple binary comparisons can depend on the experimental context. For instance, in a treatment/control experiment that includes multiple time points, a multinomial comparison using all available classes could be useful to visualise the general structure of the data. However, it could make any extracted explanatory features difficult to reason about as to how they relate to the individual experimental time point or treatment conditions. An investigator could instead identify the binary comparisons relevant to the biological question and focus the further classification comparisons to better select for explanatory features.
-In experiments with more than two classes, multinomial random forest classification can be used to assess the discrimination between the classes and give an overview of the relative structure between classes.
-The example data set consists of a total of 6 classes for the day
response variable.
-d %>%
- clsExtract(cls = 'day') %>%
- unique()
-#> [1] 2 3 4 1 H 5
-#> Levels: 1 2 3 4 5 H
Multinomial classification can be performed by:
-
-multinomial_rf <- d %>%
- randomForest(cls = 'day')
-
-print(multinomial_rf)
-#>
-#> Random forest classification
-#>
-#> Samples: 120
-#> Features: 500
-#> Response: day
-#> # comparisons: 1
The performance of this model can be assessed using metrics based on the success of the out of bag (OOB) predictions. The performance metrics can be extracted using:
-
-multinomial_rf %>%
- metrics()
-#> # A tibble: 4 × 5
-#> Response Comparison .metric .estimator .estimate
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 1~2~3~4~5~H accuracy multiclass 0.8
-#> 2 day 1~2~3~4~5~H kap multiclass 0.76
-#> 3 day 1~2~3~4~5~H roc_auc hand_till 0.964
-#> 4 day 1~2~3~4~5~H margin <NA> 0.146
These metrics include accuracy, Cohen’s kappa (kap), area under the receiver operator characteristic curve (roc_auc, ROC-AUC) and margin. Each metric has both strengths and weaknesses that depend on the context of the classification such as the balance of observations between the classes. As shown below, the class frequencies for this example are balanced with 20 observations per class.
-
-d %>%
- clsExtract(cls = 'day') %>%
- table()
-#> .
-#> 1 2 3 4 5 H
-#> 20 20 20 20 20 20
In this context, each of these metrics could be used to assess the predictive performance of the model. The margin metric is the difference between the proportion of votes for the correct class and the maximum proportion of votes for the other classes for a given observation which is then averaged across all the observations. A positive margin value indicates correct classification and values greater than 0.2 can be considered as the models having strong predictive power. The margin also allows the extent of discrimination to be discerned even in very distinct cases above where both the accuracy and ROC-AUC would be registering values of 1.
-In this example, the values of all the metrics suggest that the model is showing good predictive performance. This can be investigated further by plotting the MDS of observation proximity values.
-
-multinomial_rf %>%
- plotMDS(cls = 'day')
This shows that the model is able to discriminate highly between classes such as 5
and H
. It is less able to discriminate more similar classes such as H
and 1
or 4
and 5
whose confidence ellipses show a high degree of overlap. This makes sense in the context of this experiment as these are adjacent time points that are more likely to be similar than time points at each end of the experiment.
The ROC curves can also be plotted as shown below.
-
-multinomial_rf %>%
- plotROC()
Classes with their line further from the central dashed line are those that were predicted with the greatest reliability by the model. This plot shows that both the H
and 1
classes were least reliably predicted which is a result of their close proximity shown in the MDS plot previously.
Importance metrics can be used to identify the metabolome features that contribute most to the class discrimination in the model. The available importance metrics for this model are shown below.
-
-importanceMetrics(multinomial_rf)
-#> [1] "1" "2" "3"
-#> [4] "4" "5" "FalsePositiveRate"
-#> [7] "H" "MeanDecreaseAccuracy" "MeanDecreaseGini"
-#> [10] "SelectionFrequency"
Here, we will use the false positive rate metric with a threshold of below 0.05 to identify explanatory features for the day
response variable.
-multinomial_rf %>%
- explanatoryFeatures(metric = 'FalsePositiveRate',
- threshold = 0.05)
-#> # A tibble: 121 × 5
-#> Response Comparison Feature Metric Value
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 1~2~3~4~5~H N341 FalsePositiveRate 1.02e-93
-#> 2 day 1~2~3~4~5~H N133 FalsePositiveRate 7.38e-68
-#> 3 day 1~2~3~4~5~H N163 FalsePositiveRate 3.59e-61
-#> 4 day 1~2~3~4~5~H N439 FalsePositiveRate 1.07e-54
-#> 5 day 1~2~3~4~5~H N342 FalsePositiveRate 3.19e-49
-#> 6 day 1~2~3~4~5~H N377 FalsePositiveRate 3.19e-49
-#> 7 day 1~2~3~4~5~H N171 FalsePositiveRate 6.26e-44
-#> 8 day 1~2~3~4~5~H N497 FalsePositiveRate 6.11e-30
-#> 9 day 1~2~3~4~5~H N146 FalsePositiveRate 2.74e-29
-#> 10 day 1~2~3~4~5~H N195 FalsePositiveRate 7.16e-25
-#> # … with 111 more rows
As shown above there were a total of 121 explanatory features identified.
-Within a multinomial experiment, it is also possible to specify the exact class comparisons to include, where it might not be suitable to compare all the classes at once using the comparisons
argument. This should be specified as a named list, with the names corresponding to the cls
argument. Each named element should then consist of a vector of comparisons, the classes to compare separated using the ~
.
The following specifies two comparisons (H~1~2
,H~1~5
) for the day
response variable and displays the performance metrics.
-d %>%
- randomForest(cls = 'day',
- comparisons = list(day = c('H~1~2',
- 'H~1~5'))) %>%
- metrics()
-#> # A tibble: 8 × 5
-#> Response Comparison .metric .estimator .estimate
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day H~1~2 accuracy multiclass 0.833
-#> 2 day H~1~2 kap multiclass 0.75
-#> 3 day H~1~5 accuracy multiclass 0.75
-#> 4 day H~1~5 kap multiclass 0.625
-#> 5 day H~1~2 roc_auc hand_till 0.906
-#> 6 day H~1~5 roc_auc hand_till 0.909
-#> 7 day H~1~2 margin <NA> 0.172
-#> 8 day H~1~5 margin <NA> 0.320
The MDS and ROC curve plots can also be plotted simultaneously for the two comparisons.
-
-d %>%
- randomForest(cls = 'day',
- comparisons = list(day = c('H~1~2',
- 'H~1~5'))) %>%
- {plotMDS(.,cls = 'day') +
- plotROC(.) +
- patchwork::plot_layout(ncol = 1)}
Similarly, it is also possible to model multiple response factors with a single random forest call by specifying a vector of response class information column names to the cls
argument. In the following, both the name
and day
response factors will be analysed and the performance metrics returned in a single table.
-d %>%
- randomForest(cls = c('name','day')) %>%
- metrics()
-#> Warning: Classes with < 5 replicates removed: "11_3", "11_4", "11_5", "11_6",
-#> "11_H", "12_1", "12_3", "12_6", "12_H", "13_1", "13_2", "13_3", "13_5", "13_6",
-#> "13_H", "14_2", "14_3", "14_5", "14_6", "14_H", "15_1", "15_2", "15_4", "15_5",
-#> "15_6", "15_H"
-#> Unbalanced classes detected. Stratifying sample size to the smallest class size.
-#> # A tibble: 8 × 5
-#> Response Comparison .metric .estimator .estimate
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 name 11_2~12_2~12_4~13_4~14_4~15_3 accuracy multiclass 0.35
-#> 2 name 11_2~12_2~12_4~13_4~14_4~15_3 kap multiclass 0.212
-#> 3 name 11_2~12_2~12_4~13_4~14_4~15_3 roc_auc hand_till 0.753
-#> 4 name 11_2~12_2~12_4~13_4~14_4~15_3 margin <NA> -0.0485
-#> 5 day 1~2~3~4~5~H accuracy multiclass 0.8
-#> 6 day 1~2~3~4~5~H kap multiclass 0.76
-#> 7 day 1~2~3~4~5~H roc_auc hand_till 0.964
-#> 8 day 1~2~3~4~5~H margin <NA> 0.146
The MDS plots can also be returned for both models simultaneously.
-
-d %>%
- randomForest(cls = c('name','day')) %>%
- plotMDS()
-#> Warning: Classes with < 5 replicates removed: "11_3", "11_4", "11_5", "11_6",
-#> "11_H", "12_1", "12_3", "12_6", "12_H", "13_1", "13_2", "13_3", "13_5", "13_6",
-#> "13_H", "14_2", "14_3", "14_5", "14_6", "14_H", "15_1", "15_2", "15_4", "15_5",
-#> "15_6", "15_H"
-#> Unbalanced classes detected. Stratifying sample size to the smallest class size.
It may in some cases be preferable to analyse class comparisons as multiple binary comparisons.
-The possible binary comparisons for a given response variable can be displayed using the binaryComparisons()
method. Below shows the 15 comparisons for the day
response variable.
-binaryComparisons(d,cls = 'day')
-#> [1] "1~2" "1~3" "1~4" "1~5" "1~H" "2~3" "2~4" "2~5" "2~H" "3~4" "3~5" "3~H"
-#> [13] "4~5" "4~H" "5~H"
For this example we will only use the binary comparisons containing the H
class.
-binary_comparisons <- binaryComparisons(d,cls = 'day') %>%
- .[stringr::str_detect(.,'H')]
The binary comparisons can then be performed using the following.
-
-binary_rf <- d %>%
- randomForest(cls = 'day',
- comparisons = list(day = binary_comparisons))
-
-print(binary_rf)
-#>
-#> Random forest classification
-#>
-#> Samples: 120
-#> Features: 500
-#> Response: day
-#> # comparisons: 5
To run all possible binary comparisons, the binary = TRUE
argument could instead be used.
The MDS plots for each comparison can be visualised to inspect the comparisons.
-
-binary_rf %>%
- plotMDS(cls = 'day')
These plots show good separation in all the comparisons except H~1
which is also shown by the plot of the performance metrics below. Each of the comparisons is showing perfect performance for the accuracy, Cohen’s kappa and ROC-AUC metrics as well as very high margin values except for the H~1
comparison.
-binary_rf %>%
- plotMetrics()
The explanatory features for these comparisons can be extracted as below using the false positive rate metric and a cut-off threshold of 0.05. This gives a total of 251 explanatory features.
-
-binary_rf %>%
- explanatoryFeatures(metric = 'FalsePositiveRate',
- threshold = 0.05)
-#> # A tibble: 251 × 5
-#> Response Comparison Feature Metric Value
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 2~H N341 FalsePositiveRate 7.34e-52
-#> 2 day 2~H N439 FalsePositiveRate 1.80e-45
-#> 3 day 3~H N342 FalsePositiveRate 2.71e-39
-#> 4 day 2~H N327 FalsePositiveRate 1.06e-35
-#> 5 day 3~H N439 FalsePositiveRate 1.06e-35
-#> 6 day 2~H N477 FalsePositiveRate 1.60e-34
-#> 7 day 3~H N377 FalsePositiveRate 1.60e-34
-#> 8 day 4~H N477 FalsePositiveRate 7.40e-34
-#> 9 day 2~H N447 FalsePositiveRate 6.48e-30
-#> 10 day 3~H N163 FalsePositiveRate 6.48e-30
-#> # … with 241 more rows
A heatmap of these explanatory features can be plotted to show their mean relative intensities across the experiment time points. Here, the classes are also refactored to customise the order of the classes on the x-axis.
-
-refactor_cls <- clsExtract(binary_rf,
- cls = 'day') %>%
- factor(.,levels = c('H','1','2','3','4','5'))
-
-binary_rf <- clsReplace(binary_rf,
- value = refactor_cls,
- cls = 'day')
-binary_rf %>%
- plotExplanatoryHeatmap(metric = 'FalsePositiveRate',
- threshold = 0.05,
- featureNames = TRUE)
Random forest regression can be used to assess the extent of association of the metabolomic data with continuous response variables.
-In this example, the extent of association of injection order with the example data will be assessed.
-
-regression_rf <- d %>%
- randomForest(cls = 'injorder')
-
-print(regression_rf)
-#>
-#> Random forest regression
-#>
-#> Samples: 120
-#> Features: 500
-#> Response: injorder
The regression model performance metrics, based on the OOB prediction error, can be extracted using the following:
-
-regression_rf %>%
- metrics()
-#> # A tibble: 5 × 4
-#> Response .metric .estimator .estimate
-#> <chr> <chr> <chr> <dbl>
-#> 1 injorder rsq standard 0.476
-#> 2 injorder mae standard 23.5
-#> 3 injorder mape standard 154.
-#> 4 injorder rmse standard 26.5
-#> 5 injorder ccc standard 0.508
These regression metrics include R2 (rsq
), mean absolute error (mae
), mean absolute percentage error (mape
), root mean squared error (rmse
) and the concordance correlation coefficient (ccc
).
The R2 and concordance correlation coefficient metrics suggest that there is some association of features with the injection order, although this is weak. This is in agreement with the mean absolute error metric, which shows that, on average, the injection order could only be predicted to an accuracy of 23 injection order positions.
-The MDS plot below shows the relative proximities of the samples based on this injection order regression model. This shows that for the most part, there is little correspondence of the sample positions with their injection order. However, there is a small grouping of samples towards the end of the run around samples ~99 to 120. It suggests that there could have been some analytical issues, for certain features, towards the end of the mass spectral analytical run.
-
-regression_rf %>%
- plotMDS(cls = NULL,
- ellipses = FALSE,
- label = 'injorder',
- labelSize = 3)
-#> Warning: ggrepel: 40 unlabeled data points (too many overlaps). Consider
-#> increasing max.overlaps
The available feature importance metrics for this regression model can be listed.
-
-regression_rf %>%
- importanceMetrics()
-#> [1] "%IncMSE" "IncNodePurity"
The feature importance metrics can be plotted to give an overview of their distribution. The following will plot the percentage increase in the mean squared error (%IncMSE
) importance metric.
-regression_rf %>%
- plotImportance(metric = "%IncMSE",
- rank = FALSE)
This shows that there are only a few features that are contributing to the association with injection order. These explanatory features can be extracted with the following, using a threshold of above 5.
-
-regression_rf %>%
- explanatoryFeatures(metric = '%IncMSE',
- threshold = 5)
-#> # A tibble: 7 × 4
-#> Response Feature Metric Value
-#> <chr> <chr> <chr> <dbl>
-#> 1 injorder N283 %IncMSE 19.9
-#> 2 injorder N135 %IncMSE 8.71
-#> 3 injorder N451 %IncMSE 5.58
-#> 4 injorder N161 %IncMSE 5.51
-#> 5 injorder N306 %IncMSE 5.49
-#> 6 injorder N118 %IncMSE 5.22
-#> 7 injorder N297 %IncMSE 5.07
This returned a total of 7 explanatory features above this threshold. The top ranked feature N283
can be plotted to investigate its trend in relation to injection order.
-regression_rf %>%
- plotFeature(feature = 'N283',
- cls = 'injorder')
This shows an increase in the intensity of that feature for samples above 100 in the injection order which corresponds with the cluster that was seen in the MDS plot above.
-Univariate methods select features, explanatory for response variables, with features tested on an individual basis. These methods offer simplicity and easy interpretation in their use, however they provide no information as to how features may interact.
-The univariate methods currently available in metabolyseR
include Welch’s t-test, analysis of variance (ANOVA) and linear regression. The following sections will provide brief examples of the use of each of these methods.
Welch’s t-test can be used to select explanatory metabolome features for binary comparisons of discrete variables. By default, all the possible binary comparisons for the categories of a response variable will be tested.
-Below shows the possible binary comparisons for the day
response variable for the example data set.
-binaryComparisons(d,
- cls = 'day')
-#> [1] "1~2" "1~3" "1~4" "1~5" "1~H" "2~3" "2~4" "2~5" "2~H" "3~4" "3~5" "3~H"
-#> [13] "4~5" "4~H" "5~H"
For the following example, only a subset of comparisons will be tested. These will be selected by supplying a list to the comparisons
argument.
-ttest_analysis <- ttest(d,
- cls = 'day',
- comparisons = list(day = c('H~1',
- 'H~2',
- 'H~5')))
-
-print(ttest_analysis)
-#>
-#> Univariate t-test analysis
-#>
-#> Samples: 120
-#> Features: 500
-#> Responses: day
-#> # comparisons: 3
The explanatory features that show a significant difference between the response categories can be extracted as shown below.
-
-explanatoryFeatures(ttest_analysis,
- threshold = 0.05)
-#> # A tibble: 73 × 14
-#> Response Comparison Feature estimate estimate1 estimate2 statistic p.value
-#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day H~5 N163 -735. 19.5 755. -13.8 1.43e-11
-#> 2 day H~5 N341 2445. 2537. 92.6 13.6 2.88e-11
-#> 3 day H~5 N133 1055. 1077. 21.9 13.0 5.44e-11
-#> 4 day H~2 N341 200. 293. 92.6 10.6 1.38e-10
-#> 5 day H~5 N171 62.6 64.7 2.15 11.9 2.62e-10
-#> 6 day H~5 N119 17.2 17.9 0.763 11.0 8.54e-10
-#> 7 day H~5 N342 243. 247. 4.13 10.8 1.42e- 9
-#> 8 day H~5 N343 27.4 28.3 0.961 9.83 5.99e- 9
-#> 9 day H~5 N377 152. 157. 5.05 9.81 6.75e- 9
-#> 10 day H~5 N477 103. 129. 26.1 9.30 1.05e- 8
-#> # … with 63 more rows, and 6 more variables: parameter <dbl>, conf.low <dbl>,
-#> # conf.high <dbl>, method <chr>, alternative <chr>, adjusted.p.value <dbl>
This will threshold the features based on their adjusted p-value, found in the adjusted.p.value
column of the table. The results of all of the features can be returned using the importance()
method.
A heat map of the explanatory features can be plotted to inspect the relative trends of the explanatory features in relation to the response variable
.
-plotExplanatoryHeatmap(ttest_analysis)
ANOVA can be used to select explanatory features for discrete response variables with 3 or more categories. The following example will compare all the categories in the day
response variable. However, the comparisons
argument can be used to select particular comparisons of interest.
-anova_analysis <- anova(d,
- cls = 'day')
-
-print(anova_analysis)
-#>
-#> Univariate ANOVA analysis
-#>
-#> Samples: 120
-#> Features: 500
-#> Responses: day
-#> # comparisons: 1
The explanatory features that are significantly different between the categories can then be extracted.
-
-explanatoryFeatures(anova_analysis,
- threshold = 0.05)
-#> # A tibble: 110 × 10
-#> Response Comparison Feature term df sumsq meansq statistic p.value
-#> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H N341 response 5 1.09e8 2.17e7 124. 1.90e-44
-#> 2 day 1~2~3~4~5~H N163 response 5 1.25e7 2.51e6 113. 1.71e-42
-#> 3 day 1~2~3~4~5~H N133 response 5 1.96e7 3.92e6 108. 1.71e-41
-#> 4 day 1~2~3~4~5~H N171 response 5 6.29e4 1.26e4 88.8 1.16e-37
-#> 5 day 1~2~3~4~5~H N342 response 5 1.04e6 2.07e5 85.1 7.61e-37
-#> 6 day 1~2~3~4~5~H N343 response 5 1.19e4 2.38e3 66.1 4.43e-32
-#> 7 day 1~2~3~4~5~H N119 response 5 4.92e3 9.83e2 53.8 2.07e-28
-#> 8 day 1~2~3~4~5~H N497 response 5 1.10e5 2.20e4 49.6 4.83e-27
-#> 9 day 1~2~3~4~5~H N137 response 5 6.32e3 1.26e3 39.9 1.59e-23
-#> 10 day 1~2~3~4~5~H N277 response 5 6.31e4 1.26e4 39.1 3.14e-23
-#> # … with 100 more rows, and 1 more variable: adjusted.p.value <dbl>
The top ranked explanatory feature N341
can be plotted to inspect its trend relative to the day
response variable.
-plotFeature(anova_analysis,
- feature = 'N341',
- cls = 'day')
Univariate linear regression can be used to associate a continuous response variable with metabolome features. In the example below, the example data will be regressed against injection order to identify any linearly associated metabolome features.
-
-lr_analysis <- linearRegression(d,
- cls = 'injorder')
-
-print(lr_analysis)
-#>
-#> Univariate linear regression analysis
-#>
-#> Samples: 120
-#> Features: 500
-#> Responses: injorder
The explanatory features can then be extracted.
-
-explanatoryFeatures(lr_analysis)
-#> # A tibble: 8 × 15
-#> Response Feature r.squared adj.r.squared sigma statistic p.value df logLik
-#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 injorder N283 0.310 0.304 4.27 53.0 4.10e-11 1 -343.
-#> 2 injorder N135 0.165 0.157 78.7 23.2 4.31e- 6 1 -693.
-#> 3 injorder N221 0.140 0.133 5.87 19.3 2.50e- 5 1 -382.
-#> 4 injorder N473 0.135 0.127 7.24 18.3 3.78e- 5 1 -407.
-#> 5 injorder N335 0.132 0.124 20.1 17.9 4.59e- 5 1 -529.
-#> 6 injorder N452 0.120 0.112 4.00 16.0 1.10e- 4 1 -335.
-#> 7 injorder N255 0.119 0.111 11.1 15.9 1.17e- 4 1 -458.
-#> 8 injorder N267 0.118 0.111 26.4 15.8 1.22e- 4 1 -562.
-#> # … with 6 more variables: AIC <dbl>, BIC <dbl>, deviance <dbl>,
-#> # df.residual <int>, nobs <int>, adjusted.p.value <dbl>
The top ranked explanatory feature N283
can be plotted to inspect its association with injection order.
-plotFeature(lr_analysis,
- feature = 'N283',
- cls = 'injorder')
For routine analyses, the initial analysis parameters for pre-treatment of the data and then the modelling can be selected.
-
-p <- analysisParameters(c('pre-treatment','modelling'))
More specific parameters for pre-treatment of the example data can be declared using the following.
-
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- keep = 'classes',
- occupancyFilter = 'maximum',
- transform = 'TICnorm'
- )
-)
The modellingMethods()
function can be used to list the modelling methods that are currently available in metabolyseR
.
-modellingMethods()
-#> [1] "anova" "ttest" "linearRegression" "randomForest"
The modellingParameters()
function can be used to retrieve the default parameters for specific modelling methods. Below, the default modelling parameters for the randomForest
and ttest
methods are specified.
-parameters(p,'modelling') <- modellingParameters(c('randomForest','ttest'))
The class parameters can then be universally specified for both the pre-treatment and modelling elements. For this example, the day
response variable will be used with just the H
and 2
classes.
-changeParameter(p,'cls') <- 'day'
-changeParameter(p,'classes') <- c('H','2')
This gives the following parameters for the analysis.
-
-p
-#> Parameters:
-#> pre-treatment
-#> keep
-#> classes
-#> cls = day
-#> classes = c("H", "2")
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
-#>
-#> modelling
-#> randomForest
-#> cls = day
-#> rf = list()
-#> reps = 1
-#> binary = FALSE
-#> comparisons = list()
-#> perm = 0
-#> returnModels = FALSE
-#> seed = 1234
-#> ttest
-#> cls = day
-#> pAdjust = bonferroni
-#> comparisons = list()
-#> returnModels = FALSE
The analysis can then be executed.
-analysis <- metabolyse(abr1$neg,abr1$fact,p)
-#>
-#> metabolyseR v0.14.3 Tue Sep 14 10:13:39 2021
-#> ________________________________________________________________________________
-#> Parameters:
-#> pre-treatment
-#> keep
-#> classes
-#> cls = day
-#> classes = c("H", "2")
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
-#>
-#> modelling
-#> randomForest
-#> cls = day
-#> rf = list()
-#> reps = 1
-#> binary = FALSE
-#> comparisons = list()
-#> perm = 0
-#> returnModels = FALSE
-#> seed = 1234
-#> ttest
-#> cls = day
-#> pAdjust = bonferroni
-#> comparisons = list()
-#> returnModels = FALSE
-#> ________________________________________________________________________________
-#> Pre-treatment …
-
-Pre-treatment ✓ [6.4S]
-#> Modelling …
-
-Modelling ✓ [4.2S]
-#> ________________________________________________________________________________
-#>
-#> Complete! [10.6S]
The results for the modelling can be specifically extracted using the following.
-
-analysisResults(analysis,'modelling')
-#> $randomForest
-#>
-#> Random forest classification
-#>
-#> Samples: 40
-#> Features: 1713
-#> Response: day
-#> # comparisons: 1
-#>
-#>
-#> $ttest
-#>
-#> Univariate t-test analysis
-#>
-#> Samples: 40
-#> Features: 1713
-#> Responses: day
-#> # comparisons: 1
This returns the results as a list containing the modelling results objects for each specified method.
-Alternatively, the modelling results can be accessed directly from the Analysis
object. Below shows the extraction of the explanatory features, using default parameters for each method, with the results returned in a single table.
-explanatory_features <- analysis %>%
- explanatoryFeatures()
-
-print(explanatory_features)
-#> # A tibble: 100 × 17
-#> Method Response Comparison Feature Metric Value estimate estimate1
-#> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
-#> 1 randomForest day 2~H N341 FalsePo… 8.06e-28 NA NA
-#> 2 randomForest day 2~H N377 FalsePo… 5.70e-18 NA NA
-#> 3 randomForest day 2~H N447 FalsePo… 5.70e-18 NA NA
-#> 4 randomForest day 2~H N579 FalsePo… 5.70e-18 NA NA
-#> 5 randomForest day 2~H N1084 FalsePo… 1.19e-16 NA NA
-#> 6 randomForest day 2~H N327 FalsePo… 2.33e-15 NA NA
-#> 7 randomForest day 2~H N580 FalsePo… 4.32e-14 NA NA
-#> 8 randomForest day 2~H N1083 FalsePo… 7.49e-13 NA NA
-#> 9 randomForest day 2~H N1085 FalsePo… 7.49e-13 NA NA
-#> 10 randomForest day 2~H N503 FalsePo… 7.49e-13 NA NA
-#> # … with 90 more rows, and 9 more variables: estimate2 <dbl>, statistic <dbl>,
-#> # p.value <dbl>, parameter <dbl>, conf.low <dbl>, conf.high <dbl>,
-#> # method <chr>, alternative <chr>, adjusted.p.value <dbl>
Heat maps of the explanatory features can also be plotted for both the modelling methods.
-
-plotExplanatoryHeatmap(analysis) %>%
- patchwork::wrap_plots()
vignettes/introduction.Rmd
- introduction.Rmd
The metabolyseR package provides a suite of methods that encompass three elements of metabolomics data analysis:
-The package also distinguishes between the flexibility and simplicity required for exploratory analyses compared to the convenience needed for more complex routine analyses. This is reflected in the underlying S4 object-oriented implementations and associated methods defined within the package. It should be noted that it is useful to understand the principles involved in using metabolyseR for exploratory analyses to aid in extracting and wrangling the results generated from routine analyses.
-The following document will provide an introduction to the basic usage of the package and includes how to create and use the base classes that are the foundation of metabolyseR. This will be focused around the applications for both exploratory and routine analyses. For more detailed information on the individual analysis elements see their associated vignette using:
-
-browseVignettes('metabolyseR')
There is also an example quick start analysis vignette provided.
-
-vignette('quick_start','metabolyseR')
Any issues, bugs or errors encountered while using the package should be reported here.
-The examples shown here will use the abr1
data set from the metaboData package (?metaboData::abr1
). This is a nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data set from a plant-pathogen infection time course experiment. The examples will also include use of the pipe %>%
from the magrittr package.
Firstly load the necessary packages:
- -The package supports parallel processing using the future package.
-By default, processing by metabolyseR
will be done sequentially. However, parallel processing can be activated, prior to analysis, by specifying a parallel back-end using plan()
. The following example specifies using the multisession
implementation (multiple background R sessions) with two worker processes.
-plan(future::multisession,workers = 2)
See the future package documentation for more information on the types of parallel implementations that are available.
-For exploratory analyses, simple questions of the data need to be answered quickly, requiring few steps. Key requirements for any tool used by investigators are that it should be both simple and flexible.
-In metabolyseR, the AnalysisData
class is the base S4 class that provides these requirements. The following sections will give an overview of the basics in constructing and using these objects as the base for analysis.
We can firstly construct an AnalysisData
object which requires two data tables. The first is the metabolomic data where the columns are the metabolome features, the rows the sample observations and contains the abundance values. The second is the sample meta-information where the row order should match to that of the metabolome data table. Using the example data, this can be constructed and assigned to the variable d
by:
-d <- analysisData(data = abr1$neg,
- info = abr1$fact)
Where abr1$neg
is the negative ionisation mode data and abr1$fact
is the corresponding sample information. By printing d
we can view some basic information about our data.
-print(d)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 9
-We can also return the numbers of samples and numbers of features respectively using the following:
-
-nSamples(d)
## [1] 120
-
-nFeatures(d)
## [1] 2000
-The data table can be extracted using the dat
method:
-dat(d)
## # A tibble: 120 × 2,000
-## N1 N2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 7 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 8 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 9 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 10 0 0 0 0 0 0 0 0 0 0 0 0 0
-## # … with 110 more rows, and 1,987 more variables: N14 <dbl>, N15 <dbl>,
-## # N16 <dbl>, N17 <dbl>, N18 <dbl>, N19 <dbl>, N20 <dbl>, N21 <dbl>,
-## # N22 <dbl>, N23 <dbl>, N24 <dbl>, N25 <dbl>, N26 <dbl>, N27 <dbl>,
-## # N28 <dbl>, N29 <dbl>, N30 <dbl>, N31 <dbl>, N32 <dbl>, N33 <dbl>,
-## # N34 <dbl>, N35 <dbl>, N36 <dbl>, N37 <dbl>, N38 <dbl>, N39 <dbl>,
-## # N40 <dbl>, N41 <dbl>, N42 <dbl>, N43 <dbl>, N44 <dbl>, N45 <dbl>,
-## # N46 <dbl>, N47 <dbl>, N48 <dbl>, N49 <dbl>, N50 <dbl>, N51 <dbl>, …
-Or alternatively, can be used to assign a new data table:
-
-dat(d) <- abr1$pos
-d
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 9
-The sample information table can be extracted using the sinfo
method:
-sinfo(d)
## # A tibble: 120 × 9
-## injorder pathcdf filecdf name.org remark name rep day class
-## <int> <fct> <fct> <fct> <fct> <fct> <int> <fct> <int>
-## 1 1 C:/Xcalibur/ANDI-LT… 01.cdf 12_2 ok 12_2 2 2 2
-## 2 2 C:/Xcalibur/ANDI-LT… 02.cdf 13_3 ok 13_4 3 3 3
-## 3 3 C:/Xcalibur/ANDI-LT… 03.cdf 15_4 ok 15_5 5 4 4
-## 4 4 C:/Xcalibur/ANDI-LT… 04.cdf 12_1 ok 12_2 2 1 1
-## 5 5 C:/Xcalibur/ANDI-LT… 05.cdf 12_2 ok 12_2 2 2 2
-## 6 6 C:/Xcalibur/ANDI-LT… 06.cdf 11_1 ok 11_2 1 1 1
-## 7 7 C:/Xcalibur/ANDI-LT… 07.cdf 14_2 ok 14_3 4 2 2
-## 8 8 C:/Xcalibur/ANDI-LT… 08.cdf 11_4 ok 11_5 1 4 4
-## 9 9 C:/Xcalibur/ANDI-LT… 09.cdf 13_H ok 13_H 3 H 6
-## 10 10 C:/Xcalibur/ANDI-LT… 10.cdf 15_H ok 15_H 5 H 6
-## # … with 110 more rows
-And similarly used to assign a new sample information table:
-
-sinfo(d) <- abr1$fact[,1:2]
-d
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 2
-There are a number of methods that provide utility for querying and altering the sample information within an AnalysisData
object. These methods are all named with the prefix cls
and include:
clsAdd
clsArrange
clsAvailable
clsExtract
clsRemove
clsRename
clsReplace
The names of the available sample information columns can be shown using clsAvailable()
.
-clsAvailable(d)
## [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name" "rep"
-## [8] "day" "class"
-A given column can be extracted using clsExtract()
. Here, the day
column is extracted.
-clsExtract(d,cls = 'day')
## [1] 2 3 4 1 2 1 2 4 H H 4 5 1 2 H 5 3 3 2 H 4 3 5 4 H H 3 H H 1 1 1 5 5 3 4 H
-## [38] 1 5 5 1 2 4 3 2 4 3 2 5 4 4 H 3 4 2 4 4 1 5 4 4 1 1 H 3 2 H 3 3 1 2 H H 2
-## [75] 3 5 3 2 5 2 4 3 H 2 3 2 1 1 4 5 3 2 1 H 5 2 4 H 1 4 4 1 1 5 H 5 1 3 3 5 5
-## [112] 5 3 2 5 H 5 H 2 1
-## Levels: 1 2 3 4 5 H
-Sample class frequencies could then be computed.
-
-clsExtract(d,cls = 'day') %>%
- table()
## .
-## 1 2 3 4 5 H
-## 20 20 20 20 20 20
-It can be seen that there are 20 samples available in each class.
-Another example is the addition of a new sample information column. In the following, a column called new_class
will be added with all samples labelled 1
.
-d <- clsAdd(d,cls = 'new_class',value = rep(1,nSamples(d)))
-clsAvailable(d)
## [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name"
-## [7] "rep" "day" "class" "new_class"
-Samples or features can easily be kept or removed from an AnalysisData
object as is most convenient.
Below can be seen the first 6 sample indexes in the injorder
column of the sample information.
-samples <- d %>%
- clsExtract(cls = 'injorder') %>%
- head()
-
-print(samples)
## [1] 1 2 3 4 5 6
-Only these samples could be kept using:
-
-d %>%
- keepSamples(idx = 'injorder',samples = samples)
##
-## AnalysisData object containing:
-##
-## Samples: 6
-## Features: 2000
-## Info: 10
-Or removed using:
-
-d %>%
- removeSamples(idx = 'injorder',samples = samples)
##
-## AnalysisData object containing:
-##
-## Samples: 114
-## Features: 2000
-## Info: 10
-The process is very similar for keeping or removing specific metabolome features from the data table. Below can be seen the first 6 feature names in the data table.
- -## [1] "N1" "N2" "N3" "N4" "N5" "N6"
-Only these features can be kept using:
-
-d %>%
- keepFeatures(features = feat)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 6
-## Info: 10
-Or to remove these features:
-
-d %>%
- removeFeatures(features = feat)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 1994
-## Info: 10
-Routine analyses are those that are often made up of numerous steps where parameters have likely already been previously established. The emphasis here is on convenience with as little code as possible required. In these analyses, the necessary analysis elements, order and parameters are first prepared and then the analysis routine subsequently performed in a single step. This section will introduce how this type of analysis can be performed using metabolyseR and will include four main topics:
-Parameter selection is the fundamental aspect for performing routine analyses using metabolyseR and will be the step requiring the most input from the user. The parameters for an analysis are stored in an S4 object of class AnalysisParameters
containing the relevant parameters of the selected analysis elements.
The parameters have been named so that they denote the same functionality commonly across all analysis element methods. Discussion of the specific parameters can be found within the vignettes of the relevant analysis elements. These can be accessed using:
-
-browseVignettes('metabolyseR')
There are several ways to specify the parameters to use for analysis. The first is programmatically and the second is through the use of the YAML format.
-The available analysis elements can be shown using:
- -## [1] "pre-treatment" "modelling" "correlations"
-The analysisParameters()
function can be used to create an AnalysisParameters
object containing the default parameters. For example, the code below will return default parameters for all the metabolyseR analysis elements.
-p <- analysisParameters()
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = class
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = class
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = class
-## occupancy = 2/3
-## impute
-## class
-## cls = class
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = class
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-##
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-To retrieve parameters for a subset of analysis elements the following can be run, returning parameters for only the pre-treatment and modelling elements.
-
-p <- analysisParameters(c('pre-treatment','modelling'))
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = class
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = class
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = class
-## occupancy = 2/3
-## impute
-## class
-## cls = class
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = class
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-The changeParameter()
function can be used to uniformly change these parameters across all of the selected methods. The example below changes the defaults of all the parameters named cls
from the default class
to day
.
-p <- analysisParameters()
-changeParameter(p,'cls') <- 'day'
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = day
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = day
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = day
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = day
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = day
-## occupancy = 2/3
-## impute
-## class
-## cls = day
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = day
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-##
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-Alternatively, the parameters of specific analysis elements can be targeted using the elements
argument. The following will only alter the cls
parameter back to class
for the pre-treatment element parameters:
-changeParameter(p,'cls',elements = 'pre-treatment') <- 'class'
Parameters can be extracted from the AnalysisParameters
class using the parameters()
function for a specified element.
-parameters(p,'correlations')
## $method
-## [1] "pearson"
-##
-## $pAdjustMethod
-## [1] "bonferroni"
-##
-## $corPvalue
-## [1] 0.05
-Each analysis element has a function for returning default parameters for specific methods. These include preTreatmentParameters()
, modellingParameters()
and correlationParameters()
. Each returns a list of the default parameters for the specified methods as shown in the example for modellingParameters()
below.
-modellingParameters('anova')
## $anova
-## $anova$cls
-## [1] "class"
-##
-## $anova$pAdjust
-## [1] "bonferroni"
-##
-## $anova$comparisons
-## list()
-##
-## $anova$returnModels
-## [1] FALSE
-Refer to the documentation (?
) of each function for specific usage details.
The parameters returned by these functions can be assigned to an AnalysisParameters
object, again using parameters()
:
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- occupancyFilter = 'maximum',
- transform = 'TICnorm'
- )
- )
Due to the relatively complex structure of the parameters needed for analyses containing many components, it is also possible to specify analysis parameters using the YAML file format. YAML parameter files (.yaml) can be parsed using the parseParameters()
function. The example below shows the YAML specification for the defaults returned by analysisParameters()
.
pre-treatment:
- QC:
- occupancyFilter:
- cls: class
- QCidx: QC
- occupancy: 0.667
- impute:
- cls: class
- QCidx: QC
- occupancy: 0.667
- RSDfilter:
- cls: class
- QCidx: QC
- RSDthresh: 0.5
- removeQC:
- cls: class
- QCidx: QC
- occupancyFilter:
- maximum:
- cls: class
- occupancy: 0.667
- impute:
- class:
- cls: class
- occupancy: 0.667
- nCores: 4
- clusterType: FORK
- transform:
- TICnorm: ~
-classification:
- cls: class
- method: randomForest
- pars:
- sampling: boot
- niter: 10
- nreps: 10
- strat: yes
- nCores: 4
- clusterType: Fork
-featureSelection:
- method: fs.rf
- cls: class
- pars:
- fs.rf:
- nreps: 100
- nCores: 4
- clusterType: FORK
-correlations:
- method: pearson
- pAdjustMethod: bonferroni
- corPvalue: 0.05
This can be passed directly into an AnalysisParameters
object using the following:
-paramFile <- system.file('defaultParameters.yaml',package = 'metabolyseR')
-p <- parseParameters(paramFile)
For more complex pre-treatment situations such as the following:
-pre-treatment:
- remove:
- sample:
- idx: fileOrder
- samples: 1
- remove1:
- class:
- cls: day
- classes:
- - H
- - 1
- occupancyFilter:
- maximum:
- cls: class
- occupancy: 0.667
- transform:
- TICnorm: ~
Where multiple steps of the same method are needed (here, remove
), these are numbered sequentially. Where multiple values also need to be provided to a particular argument (e.g. classes = c('H','1')
), these should be supplied as a hyphenated list.
Existing AnalysisParameters
objects can also be exported to YAML format as shown below:
-p <- analysisParameters()
-exportParameters(p,file = 'analysis_parameters.yaml')
The analysis is performed in a single step using the metabolyse()
function. This accepts the metabolomic data, the sample information and the analysis parameters.
The metabolomic data should be a table of abundance values where the columns are the metabolome features and the rows are each sample observation. Similarly, the sample meta-information table should consist of the observations as rows and the meta information as columns. The order of the observation rows of the sample information table should be concordant with the rows in the metabolomics data table.
-We can run an example analysis using the abr1
data set by first generating the default parameters for pre-treatment and modelling (random forest) analysis elements.
-p <- analysisParameters(c('pre-treatment','modelling'))
Custom pre-treatment parameters can then be specified to only include occupancy filtering and total ion count normalisation.
-
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- occupancyFilter = 'maximum',
- transform = 'TICnorm')
-)
Next the cls
parameters can be changed to use the day
sample information column throughout the analysis.
-changeParameter(p,'cls') <- 'day'
Finally, the analysis can be run in a single step. Here only the first 200 features of the negative ionisation mode data are specified to reduce the analysis time needed for this example.
-
-analysis <- metabolyse(abr1$neg[,1:200],abr1$fact,p)
##
-## metabolyseR v0.14.3 Tue Sep 14 11:36:36 2021
-## ________________________________________________________________________________
-## Parameters:
-## pre-treatment
-## occupancyFilter
-## maximum
-## cls = day
-## occupancy = 2/3
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = day
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-## ________________________________________________________________________________
-## Pre-treatment …
-
-Pre-treatment ✓ [0.8S]
-## Modelling …
-
-Modelling ✓ [3S]
-## ________________________________________________________________________________
-##
-## Complete! [3.8S]
-Note: If a data pre-treatment step is not performed prior to modelling or correlation analysis, the raw data will automatically be used.
-The analysis
object containing the analysis results can be printed to provide some basic information about the results of the analysis.
-print(analysis)
##
-## metabolyseR v0.14.3
-## Analysis:
-## Tue Sep 14 11:36:36 2021
-##
-## Raw Data:
-## No. samples = 120
-## No. features = 200
-##
-## Pre-treated Data:
-## Tue Sep 14 11:36:37 2021
-## No. samples = 120
-## No. features = 48
-##
-## Modelling:
-## Tue Sep 14 11:36:40 2021
-## Methods: randomForest
-There are likely to be occasions where an analysis will need to be re-analysed using a new set of parameters. This can be achieved using the reAnalyse()
function.
In the example below we will run a correlation analysis in addition to the pre-treatment and modelling elements already performed.
-Firstly, we can specify the correlation parameters:
-
-parameters <- analysisParameters('correlations')
Then perform the re-analysis on our previously analysed Analysis
object, specifying the additional parameters.
-analysis <- reAnalyse(analysis,parameters)
##
-## metabolyseR v0.14.3 Tue Sep 14 11:36:40 2021
-## ________________________________________________________________________________
-## Parameters:
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-## ________________________________________________________________________________
-## Correlations …
-
-Correlations ✓ [0.1S]
-## ________________________________________________________________________________
-##
-## Complete! [0.1S]
-An overview of the results of the analysis (now including correlations) can then be printed.
-
-print(analysis)
##
-## metabolyseR v0.14.3
-## Analysis:
-## Tue Sep 14 11:36:36 2021
-##
-## Raw Data:
-## No. samples = 120
-## No. features = 200
-##
-## Pre-treated Data:
-## Tue Sep 14 11:36:37 2021
-## No. samples = 120
-## No. features = 48
-##
-## Modelling:
-## Tue Sep 14 11:36:40 2021
-## Methods: randomForest
-##
-## Correlations:
-## Tue Sep 14 11:36:40 2021
-## No. correlations = 140
-An analysis performed by metabolyse()
returns an S4 object of class Analysis
. There are a number of ways of extracting analysis results from this object.
Similarly to the AnalysisData
class, the dat()
and sinfo()
functions can be used to extract the metabolomics data or sample information tables directly for either the raw
or pre-treated
data.
For example, to extract the pre-treated metabolomics data from our object analysis
:
-dat(analysis,type = 'pre-treated')
## # A tibble: 120 × 48
-## N113 N115 N117 N118 N119 N127 N128 N129 N130 N131
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 0.00646 0 1.68e-4 0 1.60e-3 0.0323 2.65e-4 2.80e-4 0 0
-## 2 0.0113 7.74e-4 1.02e-3 0 1.43e-3 0.00856 0 3.95e-4 0 0
-## 3 0.00931 6.01e-4 2.70e-3 6.22e-5 5.58e-3 0 0 1.05e-4 0 6.51e-4
-## 4 0.00798 0 0 0 1.62e-4 0.00848 0 4.05e-4 0 1.28e-4
-## 5 0.0105 0 0 0 0 0.00658 0 1.97e-3 0 0
-## 6 0.00454 0 2.48e-4 3.25e-4 5.31e-4 0.00207 0 1.98e-4 0 0
-## 7 0.0117 0 1.14e-3 0 4.39e-4 0.00603 0 4.04e-4 0 0
-## 8 0.00787 2.36e-3 1.43e-3 1.52e-4 4.22e-3 0.00290 2.78e-4 5.76e-5 0 0
-## 9 0.00136 1.87e-4 8.17e-4 1.87e-4 0 0.0610 1.31e-4 5.23e-4 0 0
-## 10 0.00899 4.26e-4 2.06e-3 0 8.36e-4 0.00106 7.72e-4 0 0 0
-## # … with 110 more rows, and 38 more variables: N132 <dbl>, N133 <dbl>,
-## # N134 <dbl>, N135 <dbl>, N136 <dbl>, N137 <dbl>, N139 <dbl>, N143 <dbl>,
-## # N145 <dbl>, N146 <dbl>, N147 <dbl>, N149 <dbl>, N153 <dbl>, N155 <dbl>,
-## # N157 <dbl>, N161 <dbl>, N163 <dbl>, N164 <dbl>, N165 <dbl>, N168 <dbl>,
-## # N169 <dbl>, N170 <dbl>, N171 <dbl>, N173 <dbl>, N174 <dbl>, N175 <dbl>,
-## # N179 <dbl>, N180 <dbl>, N181 <dbl>, N183 <dbl>, N187 <dbl>, N191 <dbl>,
-## # N192 <dbl>, N193 <dbl>, N195 <dbl>, N196 <dbl>, N197 <dbl>, N198 <dbl>
-Or to extract the raw sample information:
-
-sinfo(analysis,type = 'raw')
## # A tibble: 120 × 9
-## injorder pathcdf filecdf name.org remark name rep day class
-## <int> <fct> <fct> <fct> <fct> <fct> <int> <fct> <int>
-## 1 1 C:/Xcalibur/ANDI-LT… 01.cdf 12_2 ok 12_2 2 2 2
-## 2 2 C:/Xcalibur/ANDI-LT… 02.cdf 13_3 ok 13_4 3 3 3
-## 3 3 C:/Xcalibur/ANDI-LT… 03.cdf 15_4 ok 15_5 5 4 4
-## 4 4 C:/Xcalibur/ANDI-LT… 04.cdf 12_1 ok 12_2 2 1 1
-## 5 5 C:/Xcalibur/ANDI-LT… 05.cdf 12_2 ok 12_2 2 2 2
-## 6 6 C:/Xcalibur/ANDI-LT… 06.cdf 11_1 ok 11_2 1 1 1
-## 7 7 C:/Xcalibur/ANDI-LT… 07.cdf 14_2 ok 14_3 4 2 2
-## 8 8 C:/Xcalibur/ANDI-LT… 08.cdf 11_4 ok 11_5 1 4 4
-## 9 9 C:/Xcalibur/ANDI-LT… 09.cdf 13_H ok 13_H 3 H 6
-## 10 10 C:/Xcalibur/ANDI-LT… 10.cdf 15_H ok 15_H 5 H 6
-## # … with 110 more rows
-Alternatively the raw
or preTreated
functions can be used to extract the AnalysisData
class objects containing both the metabolomics data and sample information for the raw and pre-treated data respectively.
-raw(analysis)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 200
-## Info: 9
-
-preTreated(analysis)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 48
-## Info: 9
-Lastly the analysisResults
function can be used to extract the results of any of the analysis elements. The following will extract the modelling results:
-analysisResults(analysis,element = 'modelling')
## $randomForest
-##
-## Random forest classification
-##
-## Samples: 120
-## Features: 48
-## Response: day
-## # comparisons: 1
-vignettes/metabolyseR.Rmd
- metabolyseR.Rmd
The metabolyseR package provides a suite of methods that encompass three elements of metabolomics data analysis:
-The package also distinguishes between the flexibility and simplicity required for exploratory analyses compared to the convenience needed for more complex routine analyses. This is reflected in the underlying S4 object-oriented implementations and associated methods defined within the package. It should be noted that it is useful to understand the principles involved in using metabolyseR for exploratory analyses to aid in extracting and wrangling the results generated from routine analyses.
-The following document will provide an introduction to the basic usage of the package and includes how to create and use the base classes that are the foundation of metabolyseR. This will be focused around the applications for both exploratory and routine analyses. For more detailed information on the individual analysis elements see their associated vignette using:
-
-browseVignettes('metabolyseR')
There is also an example quick start analysis vignette provided.
-
-vignette('quick_start','metabolyseR')
Any issues, bugs or errors encountered while using the package should be reported here.
-The examples shown here will use the abr1
data set from the metaboData package (?metaboData::abr1
). This is a nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data set from a plant-pathogen infection time course experiment. The examples will also include use of the pipe %>%
from the magrittr package.
Firstly load the necessary packages:
- -The package supports parallel processing using the future package.
-By default, processing by metabolyseR
will be done sequentially. However, parallel processing can be activated, prior to analysis, by specifying a parallel back-end using plan()
. The following example specifies using the multisession
implementation (multiple background R sessions) with two worker processes.
-plan(future::multisession,workers = 2)
See the future package documentation for more information on the types of parallel implementations that are available.
-For exploratory analyses, simple questions of the data need to be answered quickly, requiring few steps. Key requirements for any tool used by investigators are that it should be both simple and flexible.
-In metabolyseR, the AnalysisData
class is the base S4 class that provides these requirements. The following sections will give an overview of the basics in constructing and using these objects as the base for analysis.
We can firstly construct an AnalysisData
object which requires two data tables. The first is the metabolomic data where the columns are the metabolome features, the rows the sample observations and contains the abundance values. The second is the sample meta-information where the row order should match that of the metabolome data table. Using the example data, this can be constructed and assigned to the variable d
by:
-d <- analysisData(data = abr1$neg,
- info = abr1$fact)
Where abr1$neg
is the negative ionisation mode data and abr1$fact
is the corresponding sample information. By printing d
we can view some basic information about our data.
-print(d)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 9
-We can also return the numbers of samples and numbers of features respectively using the following:
-
-nSamples(d)
## [1] 120
-
-nFeatures(d)
## [1] 2000
-The data table can be extracted using the dat
method:
-dat(d)
## # A tibble: 120 × 2,000
-## N1 N2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 7 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 8 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 9 0 0 0 0 0 0 0 0 0 0 0 0 0
-## 10 0 0 0 0 0 0 0 0 0 0 0 0 0
-## # … with 110 more rows, and 1,987 more variables: N14 <dbl>, N15 <dbl>,
-## # N16 <dbl>, N17 <dbl>, N18 <dbl>, N19 <dbl>, N20 <dbl>, N21 <dbl>,
-## # N22 <dbl>, N23 <dbl>, N24 <dbl>, N25 <dbl>, N26 <dbl>, N27 <dbl>,
-## # N28 <dbl>, N29 <dbl>, N30 <dbl>, N31 <dbl>, N32 <dbl>, N33 <dbl>,
-## # N34 <dbl>, N35 <dbl>, N36 <dbl>, N37 <dbl>, N38 <dbl>, N39 <dbl>,
-## # N40 <dbl>, N41 <dbl>, N42 <dbl>, N43 <dbl>, N44 <dbl>, N45 <dbl>,
-## # N46 <dbl>, N47 <dbl>, N48 <dbl>, N49 <dbl>, N50 <dbl>, N51 <dbl>, …
-Or alternatively, can be used to assign a new data table:
-
-dat(d) <- abr1$pos
-d
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 9
-The sample information table can be extracted using the sinfo
method:
-sinfo(d)
## # A tibble: 120 × 9
-## injorder pathcdf filecdf name.org remark name rep day class
-## <int> <fct> <fct> <fct> <fct> <fct> <int> <fct> <int>
-## 1 1 C:/Xcalibur/ANDI-LT… 01.cdf 12_2 ok 12_2 2 2 2
-## 2 2 C:/Xcalibur/ANDI-LT… 02.cdf 13_3 ok 13_4 3 3 3
-## 3 3 C:/Xcalibur/ANDI-LT… 03.cdf 15_4 ok 15_5 5 4 4
-## 4 4 C:/Xcalibur/ANDI-LT… 04.cdf 12_1 ok 12_2 2 1 1
-## 5 5 C:/Xcalibur/ANDI-LT… 05.cdf 12_2 ok 12_2 2 2 2
-## 6 6 C:/Xcalibur/ANDI-LT… 06.cdf 11_1 ok 11_2 1 1 1
-## 7 7 C:/Xcalibur/ANDI-LT… 07.cdf 14_2 ok 14_3 4 2 2
-## 8 8 C:/Xcalibur/ANDI-LT… 08.cdf 11_4 ok 11_5 1 4 4
-## 9 9 C:/Xcalibur/ANDI-LT… 09.cdf 13_H ok 13_H 3 H 6
-## 10 10 C:/Xcalibur/ANDI-LT… 10.cdf 15_H ok 15_H 5 H 6
-## # … with 110 more rows
-And similarly used to assign a new sample information table:
-
-sinfo(d) <- abr1$fact[,1:2]
-d
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 2000
-## Info: 2
-There are a number of methods that provide utility for querying and altering the sample information within an AnalysisData
object. These methods are all named with the prefix cls
and include:
clsAdd
clsArrange
clsAvailable
clsExtract
clsRemove
clsRename
clsReplace
The names of the available sample information columns can be shown using clsAvailable()
.
-clsAvailable(d)
## [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name" "rep"
-## [8] "day" "class"
-A given column can be extracted using clsExtract()
. Here, the day
column is extracted.
-clsExtract(d,cls = 'day')
## [1] 2 3 4 1 2 1 2 4 H H 4 5 1 2 H 5 3 3 2 H 4 3 5 4 H H 3 H H 1 1 1 5 5 3 4 H
-## [38] 1 5 5 1 2 4 3 2 4 3 2 5 4 4 H 3 4 2 4 4 1 5 4 4 1 1 H 3 2 H 3 3 1 2 H H 2
-## [75] 3 5 3 2 5 2 4 3 H 2 3 2 1 1 4 5 3 2 1 H 5 2 4 H 1 4 4 1 1 5 H 5 1 3 3 5 5
-## [112] 5 3 2 5 H 5 H 2 1
-## Levels: 1 2 3 4 5 H
-Sample class frequencies could then be computed.
-
-clsExtract(d,cls = 'day') %>%
- table()
## .
-## 1 2 3 4 5 H
-## 20 20 20 20 20 20
-It can be seen that there are 20 samples available in each class.
-Another example is the addition of a new sample information column. In the following, a column called new_class
will be added with all samples labelled 1
.
-d <- clsAdd(d,cls = 'new_class',value = rep(1,nSamples(d)))
-clsAvailable(d)
## [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name"
-## [7] "rep" "day" "class" "new_class"
-Samples or features can easily be kept or removed from an AnalysisData
object as is most convenient.
Below can be seen the first 6 sample indexes in the injorder
column of the sample information.
-samples <- d %>%
- clsExtract(cls = 'injorder') %>%
- head()
-
-print(samples)
## [1] 1 2 3 4 5 6
-Only these samples could be kept using:
-
-d %>%
- keepSamples(idx = 'injorder',samples = samples)
##
-## AnalysisData object containing:
-##
-## Samples: 6
-## Features: 2000
-## Info: 10
-Or removed using:
-
-d %>%
- removeSamples(idx = 'injorder',samples = samples)
##
-## AnalysisData object containing:
-##
-## Samples: 114
-## Features: 2000
-## Info: 10
-The process is very similar for keeping or removing specific metabolome features from the data table. Below can be seen the first 6 feature names in the data table.
- -## [1] "N1" "N2" "N3" "N4" "N5" "N6"
-Only these features can be kept using:
-
-d %>%
- keepFeatures(features = feat)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 6
-## Info: 10
-Or to remove these features:
-
-d %>%
- removeFeatures(features = feat)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 1994
-## Info: 10
-Routine analyses are those that are often made up of numerous steps where parameters have likely already been previously established. The emphasis here is on convenience with as little code as possible required. In these analyses, the necessary analysis elements, order and parameters are first prepared and then the analysis routine subsequently performed in a single step. This section will introduce how this type of analysis can be performed using metabolyseR and will include four main topics:
-Parameter selection is the fundamental aspect for performing routine analyses using metabolyseR and will be the step requiring the most input from the user. The parameters for an analysis are stored in an S4 object of class AnalysisParameters
containing the relevant parameters of the selected analysis elements.
The parameters have been named so that they denote the same functionality commonly across all analysis element methods. Discussion of the specific parameters can be found within the vignettes of the relevant analysis elements. These can be accessed using:
-
-browseVignettes('metabolyseR')
There are several ways to specify the parameters to use for analysis. The first is programmatically and the second is through the use of the YAML format.
-The available analysis elements can be shown using:
- -## [1] "pre-treatment" "modelling" "correlations"
-The analysisParameters()
function can be used to create an AnalysisParameters
object containing the default parameters. For example, the code below will return default parameters for all the metabolyseR analysis elements.
-p <- analysisParameters()
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = class
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = class
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = class
-## occupancy = 2/3
-## impute
-## class
-## cls = class
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = class
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-##
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-To retrieve parameters for a subset of analysis elements the following can be run, returning parameters for only the pre-treatment and modelling elements.
-
-p <- analysisParameters(c('pre-treatment','modelling'))
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = class
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = class
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = class
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = class
-## occupancy = 2/3
-## impute
-## class
-## cls = class
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = class
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-The changeParameter()
function can be used to uniformly change these parameters across all of the selected methods. The example below changes the defaults of all the parameters named cls
from the default class
to day
.
-p <- analysisParameters()
-changeParameter(p,'cls') <- 'day'
-p
## Parameters:
-## pre-treatment
-## QC
-## occupancyFilter
-## cls = day
-## QCidx = QC
-## occupancy = 2/3
-## impute
-## cls = day
-## QCidx = QC
-## occupancy = 2/3
-## parallel = variables
-## seed = 1234
-## RSDfilter
-## cls = day
-## QCidx = QC
-## RSDthresh = 50
-## removeQC
-## cls = day
-## QCidx = QC
-## occupancyFilter
-## maximum
-## cls = day
-## occupancy = 2/3
-## impute
-## class
-## cls = day
-## occupancy = 2/3
-## seed = 1234
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = day
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-##
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-Alternatively, the parameters of specific analysis elements can be targeted using the elements
argument. The following will only alter the cls
parameter back to class
for the pre-treatment element parameters:
-changeParameter(p,'cls',elements = 'pre-treatment') <- 'class'
Parameters can be extracted from the AnalysisParameters
class using the parameters()
function for a specified element.
-parameters(p,'correlations')
## $method
-## [1] "pearson"
-##
-## $pAdjustMethod
-## [1] "bonferroni"
-##
-## $corPvalue
-## [1] 0.05
-Each analysis element has a function for returning default parameters for specific methods. These include preTreatmentParameters()
, modellingParameters()
and correlationParameters()
. Each returns a list of the default parameters for the specified methods as shown in the example for modellingParameters()
below.
-modellingParameters('anova')
## $anova
-## $anova$cls
-## [1] "class"
-##
-## $anova$pAdjust
-## [1] "bonferroni"
-##
-## $anova$comparisons
-## list()
-##
-## $anova$returnModels
-## [1] FALSE
-Refer to the documentation (?
) of each function for specific usage details.
The parameters returned by these functions can be assigned to an AnalysisParameters
object, again using parameters()
:
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- occupancyFilter = 'maximum',
- transform = 'TICnorm'
- )
- )
Due to the relatively complex structure of the parameters needed for analyses containing many components, it is also possible to specify analysis parameters using the YAML file format. YAML parameter files (.yaml) can be parsed using the parseParameters()
function. The example below shows the YAML specification for the defaults returned by analysisParameters()
.
pre-treatment:
- QC:
- occupancyFilter:
- cls: class
- QCidx: QC
- occupancy: 0.667
- impute:
- cls: class
- QCidx: QC
- occupancy: 0.667
- RSDfilter:
- cls: class
- QCidx: QC
- RSDthresh: 0.5
- removeQC:
- cls: class
- QCidx: QC
- occupancyFilter:
- maximum:
- cls: class
- occupancy: 0.667
- impute:
- class:
- cls: class
- occupancy: 0.667
- transform:
- TICnorm: ~
-classification:
- cls: class
- method: randomForest
- pars:
- sampling: boot
- niter: 10
- nreps: 10
- strat: yes
-featureSelection:
- method: fs.rf
- cls: class
- pars:
- fs.rf:
- nreps: 100
-correlations:
- method: pearson
- pAdjustMethod: bonferroni
- corPvalue: 0.05
This can be passed directly into an AnalysisParameters
object using the following:
-paramFile <- system.file('defaultParameters.yaml',package = 'metabolyseR')
-p <- parseParameters(paramFile)
For more complex pre-treatment situations such as the following:
-pre-treatment:
- remove:
- sample:
- idx: fileOrder
- samples: 1
- remove1:
- class:
- cls: day
- classes:
- - H
- - 1
- occupancyFilter:
- maximum:
- cls: class
- occupancy: 0.667
- transform:
- TICnorm: ~
Where multiple steps of the same method are needed (here, remove
), these are numbered sequentially. Where multiple values also need to be provided to a particular argument (e.g. classes = c('H','1')
), these should be supplied as a hyphenated list.
Existing AnalysisParameters
objects can also be exported to YAML format as shown below:
-p <- analysisParameters()
-exportParameters(p,file = 'analysis_parameters.yaml')
The analysis is performed in a single step using the metabolyse()
function. This accepts the metabolomic data, the sample information and the analysis parameters.
The metabolomic data table of abundance values where the columns are the metabolome features and the rows are each sample observation. Similarly, the sample meta-information table should consist of the observations as rows and the meta information as columns. The order of the observation rows of the sample information table should be concordant with the rows in the metabolomics data table.
-We can run an example analysis using the abr1
data set by first generating the default parameters for pre-treatment and modelling (random forest) analysis elements.
-p <- analysisParameters(c('pre-treatment','modelling'))
Custom pre-treatment parameters can then be specified to only include occupancy filtering and total ion count normalisation.
-
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- occupancyFilter = 'maximum',
- transform = 'TICnorm')
-)
Next the cls
parameters can be changed to use the day
sample information column throughout the analysis.
-changeParameter(p,'cls') <- 'day'
Finally, the analysis can be run in a single step. Here only the first 200 features of the negative ionisation mode data are specified to reduce the analysis time needed for this example.
-
-analysis <- metabolyse(abr1$neg[,1:200],abr1$fact,p)
##
-## metabolyseR v0.14.9 Thu Jan 27 11:59:17 2022
-## ________________________________________________________________________________
-## Parameters:
-## pre-treatment
-## occupancyFilter
-## maximum
-## cls = day
-## occupancy = 2/3
-## transform
-## TICnorm
-##
-## modelling
-## randomForest
-## cls = day
-## rf = list()
-## reps = 1
-## binary = FALSE
-## comparisons = list()
-## perm = 0
-## returnModels = FALSE
-## seed = 1234
-## ________________________________________________________________________________
-##
[34mPre-treatment
[39m…
-
-
[34mPre-treatment
[39m
[32m✓
[39m [0.7S]
-##
[34mModelling
[39m…
-
[34m
-Modelling
[39m
[32m✓
[39m [2.4S]
-## ________________________________________________________________________________
-##
-##
[32mComplete!
[39m[3.1S]
-Note: If a data pre-treatment step is not performed prior to modelling or correlation analysis, the raw data will automatically be used.
-The analysis
object containing the analysis results can be printed to provide some basic information about the results of the analysis.
-print(analysis)
##
-## metabolyseR v0.14.9
-## Analysis:
-## Thu Jan 27 11:59:17 2022
-##
-## Raw Data:
-## No. samples = 120
-## No. features = 200
-##
-## Pre-treated Data:
-## Thu Jan 27 11:59:18 2022
-## No. samples = 120
-## No. features = 48
-##
-## Modelling:
-## Thu Jan 27 11:59:20 2022
-## Methods: randomForest
-There are likely to be occasions where an analysis will need to be re-analysed using a new set of parameters. This can be achieved using the reAnalyse()
function.
In the example below we will run a correlation analysis in addition to the pre-treatment and modelling elements already performed.
-Firstly, we can specify the correlation parameters:
-
-parameters <- analysisParameters('correlations')
Then perform the re-analysis on our previously analysed Analysis
object, specifying the additional parameters.
-analysis <- reAnalyse(analysis,parameters)
##
-## metabolyseR v0.14.9 Thu Jan 27 11:59:21 2022
-## ________________________________________________________________________________
-## Parameters:
-## correlations
-## method = pearson
-## pAdjustMethod = bonferroni
-## corPvalue = 0.05
-## ________________________________________________________________________________
-##
[34mCorrelations
[39m…
-
[34m
-Correlations
[39m
[32m✓
[39m [0.1S]
-## ________________________________________________________________________________
-##
-## Complete! [0.1S]
-An overview of the results of the analysis (now including correlations) can then be printed.
-
-print(analysis)
##
-## metabolyseR v0.14.9
-## Analysis:
-## Thu Jan 27 11:59:17 2022
-##
-## Raw Data:
-## No. samples = 120
-## No. features = 200
-##
-## Pre-treated Data:
-## Thu Jan 27 11:59:18 2022
-## No. samples = 120
-## No. features = 48
-##
-## Modelling:
-## Thu Jan 27 11:59:20 2022
-## Methods: randomForest
-##
-## Correlations:
-## Thu Jan 27 11:59:21 2022
-## No. correlations = 140
-An analysis performed by metabolyse()
returns an S4 object of class Analysis
. There are a number of ways of extracting analysis results from this object.
Similarly to the AnalysisData
class, the dat()
and sinfo()
functions can be used to extract the metabolomics data or sample information tables directly for either the raw
or pre-treated
data.
For example, to extract the pre-treated metabolomics data from our object analysis
:
-dat(analysis,type = 'pre-treated')
## # A tibble: 120 × 48
-## N113 N115 N117 N118 N119 N127 N128 N129 N130 N131
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 0.00646 0 1.68e-4 0 1.60e-3 0.0323 2.65e-4 2.80e-4 0 0
-## 2 0.0113 7.74e-4 1.02e-3 0 1.43e-3 0.00856 0 3.95e-4 0 0
-## 3 0.00931 6.01e-4 2.70e-3 6.22e-5 5.58e-3 0 0 1.05e-4 0 6.51e-4
-## 4 0.00798 0 0 0 1.62e-4 0.00848 0 4.05e-4 0 1.28e-4
-## 5 0.0105 0 0 0 0 0.00658 0 1.97e-3 0 0
-## 6 0.00454 0 2.48e-4 3.25e-4 5.31e-4 0.00207 0 1.98e-4 0 0
-## 7 0.0117 0 1.14e-3 0 4.39e-4 0.00603 0 4.04e-4 0 0
-## 8 0.00787 2.36e-3 1.43e-3 1.52e-4 4.22e-3 0.00290 2.78e-4 5.76e-5 0 0
-## 9 0.00136 1.87e-4 8.17e-4 1.87e-4 0 0.0610 1.31e-4 5.23e-4 0 0
-## 10 0.00899 4.26e-4 2.06e-3 0 8.36e-4 0.00106 7.72e-4 0 0 0
-## # … with 110 more rows, and 38 more variables: N132 <dbl>, N133 <dbl>,
-## # N134 <dbl>, N135 <dbl>, N136 <dbl>, N137 <dbl>, N139 <dbl>, N143 <dbl>,
-## # N145 <dbl>, N146 <dbl>, N147 <dbl>, N149 <dbl>, N153 <dbl>, N155 <dbl>,
-## # N157 <dbl>, N161 <dbl>, N163 <dbl>, N164 <dbl>, N165 <dbl>, N168 <dbl>,
-## # N169 <dbl>, N170 <dbl>, N171 <dbl>, N173 <dbl>, N174 <dbl>, N175 <dbl>,
-## # N179 <dbl>, N180 <dbl>, N181 <dbl>, N183 <dbl>, N187 <dbl>, N191 <dbl>,
-## # N192 <dbl>, N193 <dbl>, N195 <dbl>, N196 <dbl>, N197 <dbl>, N198 <dbl>
-Or to extract the raw sample information:
-
-sinfo(analysis,type = 'raw')
## # A tibble: 120 × 9
-## injorder pathcdf filecdf name.org remark name rep day class
-## <int> <fct> <fct> <fct> <fct> <fct> <int> <fct> <int>
-## 1 1 C:/Xcalibur/ANDI-LT… 01.cdf 12_2 ok 12_2 2 2 2
-## 2 2 C:/Xcalibur/ANDI-LT… 02.cdf 13_3 ok 13_4 3 3 3
-## 3 3 C:/Xcalibur/ANDI-LT… 03.cdf 15_4 ok 15_5 5 4 4
-## 4 4 C:/Xcalibur/ANDI-LT… 04.cdf 12_1 ok 12_2 2 1 1
-## 5 5 C:/Xcalibur/ANDI-LT… 05.cdf 12_2 ok 12_2 2 2 2
-## 6 6 C:/Xcalibur/ANDI-LT… 06.cdf 11_1 ok 11_2 1 1 1
-## 7 7 C:/Xcalibur/ANDI-LT… 07.cdf 14_2 ok 14_3 4 2 2
-## 8 8 C:/Xcalibur/ANDI-LT… 08.cdf 11_4 ok 11_5 1 4 4
-## 9 9 C:/Xcalibur/ANDI-LT… 09.cdf 13_H ok 13_H 3 H 6
-## 10 10 C:/Xcalibur/ANDI-LT… 10.cdf 15_H ok 15_H 5 H 6
-## # … with 110 more rows
-Alternatively the raw
or preTreated
functions can be used to extract the AnalysisData
class objects containing both the metabolomics data and sample information for the raw and pre-treated data respectively.
-raw(analysis)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 200
-## Info: 9
-
-preTreated(analysis)
##
-## AnalysisData object containing:
-##
-## Samples: 120
-## Features: 48
-## Info: 9
-Lastly the analysisResults
function can be used to extract the results of any of the analysis elements. The following will extract the modelling results:
-analysisResults(analysis,element = 'modelling')
## $randomForest
-##
-## Random forest classification
-##
-## Samples: 120
-## Features: 48
-## Response: day
-## # comparisons: 1
-vignettes/modelling.Rmd
- modelling.Rmd
Modelling provides the essential data mining step for extracting biological information and explanatory metabolome features from a data set relating to the experimental conditions. metabolyseR
provides a number of both univariate and multivariate methods for data mining.
For an introduction to the usage of metabolyseR for both exploratory and routine analyses, see the introduction vignette using:
-
-vignette('introduction','metabolyseR')
To further supplement this document, a quick start example analysis is also available as a vignette:
-
-vignette('quick_start','metabolyseR')
To begin, the package can be loaded using:
-
-library(metabolyseR)
-#>
-#> Attaching package: 'metabolyseR'
-#> The following object is masked from 'package:stats':
-#>
-#> anova
-#> The following objects are masked from 'package:base':
-#>
-#> raw, split
The examples used here will use the abr1
data set from the metaboData package. This is nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data from a plant-pathogen infection time course experiment. The pipe %>%
from the magrittr package will also be used. The example data can be loaded using:
Only the negative acquisition mode data (abr1$neg
) will be used along with the sample meta-information (abr1$fact
). Create an AnalysisData
class object, assigned to the variable d
, using the following:
-d <- analysisData(abr1$neg[,1:500],abr1$fact)
-print(d)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 500
-#> Info: 9
As can be seen above the data set contains a total of 120 samples and 500 features.
-The package supports parallel processing using the future package.
-By default, processing by metabolyseR
will be done sequentially. However, parallel processing can be activated, prior to analysis, by specifying a parallel implementation using plan()
. The following example specifies using the multisession
implementation (multiple background R sessions) with two worker processes.
-plan(future::multisession,workers = 2)
See the future package documentation for more information on the types of parallel implementations that are available.
-Random forest is a versatile ensemble machine learning approach based on forests of decision trees for multivariate data mining. This can include unsupervised analysis, classification of discrete response variables and regression of continuous responses.
-Random forest can be performed in metabolyseR
using the randomForest()
method. For further details on the arguments for using this function, see ?randomForest
. This implementation of random forest in metabolyseR
utilises the randomForest
package. See ?randomForest::randomForest
for more information about that implementation.
The unsupervised random forest approach can be a useful starting point for analysis in any experimental context. It can be used to give a general overview of the structure of the data and to identify any possible problems. These could include situations such as the presence of outlier samples or splits in the data caused by the impact of analytical or sample preparation factors. Unsupervised random forest can have advantages in these assessments over other approaches such as Principal Component Analysis (PCA). It is less sensitive to the effect of a single feature that in fact could have little overall impact relative to the other hundreds that could be present in a data set.
-The examples below will show the use of unsupervised random forest for assessing the general structure of the example data set and the presence of outlier samples.
-Unsupervised random forest can be performed by setting the cls
argument of randomForest()
to NULL
:
-unsupervised_rf <- d %>%
- randomForest(cls = NULL)
The type of random forest that has been performed can be checked using the type
method.
-type(unsupervised_rf)
-#> [1] "unsupervised"
Or by printing the results object.
-
-unsupervised_rf
-#>
-#> Unsupervised random forest
-#>
-#> Samples: 120
-#> Features: 500
Firstly, the presence of outlier samples will be assessed. A multidimensional scaling (MDS) plot can be used to visualise the relative proximity of the observations, as shown in the following. The individual points are also labelled by their injection order to enable the identification of individual samples if necessary.
-
-plotMDS(unsupervised_rf,
- cls = NULL,
- label = 'injorder',
- labelSize = 3,
- title = 'Outlier detection')
-#> Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
-#> increasing max.overlaps
From the plot above, it can be seen that a single sample lies outside the 95% confidence ellipse. It is unlikely that this sample can be considered an outlier as its position is a result of the underlying class structure as opposed to differences specific to that individual sample.
-The structure of these observations can be investigated further by colouring the points by a different experimental factor. This will be by the day
class column which is the main experimental factor of interest in this experiment.
-plotMDS(unsupervised_rf,
- cls = 'day')
This shows that it is indeed the experimental factor of interest that is having the greatest impact on the structure of the data. The progression of the experimental time points is obvious across Dimension 1.
-The available feature importance metrics for a random forest analysis can be retrieved by:
-
-importanceMetrics(unsupervised_rf)
-#> [1] "1" "2" "FalsePositiveRate"
-#> [4] "MeanDecreaseAccuracy" "MeanDecreaseGini" "SelectionFrequency"
And the importance values of these metrics for each feature can be returned using:
-
-importance(unsupervised_rf)
-#> # A tibble: 3,000 × 3
-#> Feature Metric Value
-#> <chr> <chr> <dbl>
-#> 1 N1 1 0
-#> 2 N1 2 0
-#> 3 N1 FalsePositiveRate 0.0238
-#> 4 N1 MeanDecreaseAccuracy 0
-#> 5 N1 MeanDecreaseGini 0
-#> 6 N1 SelectionFrequency 0
-#> 7 N10 1 0
-#> 8 N10 2 0
-#> 9 N10 FalsePositiveRate 0.0238
-#> 10 N10 MeanDecreaseAccuracy 0
-#> # … with 2,990 more rows
The explanatory features for a given threshold can be extracted for any of the importance metrics. The following will extract the explanatory features below a threshold of 0.05 based on the false positive rate metric.
-
-unsupervised_rf %>%
- explanatoryFeatures(metric = "FalsePositiveRate",
- threshold = 0.05)
-#> # A tibble: 359 × 3
-#> Feature Metric Value
-#> <chr> <chr> <dbl>
-#> 1 N342 FalsePositiveRate 1.31e-19
-#> 2 N161 FalsePositiveRate 2.34e-16
-#> 3 N341 FalsePositiveRate 6.50e-16
-#> 4 N315 FalsePositiveRate 1.79e-15
-#> 5 N367 FalsePositiveRate 3.47e-14
-#> 6 N173 FalsePositiveRate 9.09e-14
-#> 7 N385 FalsePositiveRate 9.09e-14
-#> 8 N133 FalsePositiveRate 1.52e-12
-#> 9 N439 FalsePositiveRate 1.52e-12
-#> 10 N379 FalsePositiveRate 3.78e-12
-#> # … with 349 more rows
In this example there are 359 explanatory features.
-The trend of the most highly ranked explanatory feature against the day
factor can be plotted using the plotFeature()
method.
-unsupervised_rf %>%
- plotFeature(feature = 'N425',
- cls = 'day')
Random forest classification can be used to assess the extent of discrimination (difference) between classes of a discrete response variable. This includes both multinomial (number of classes > 2) and binary (number of classes = 2) comparisons.
-In multinomial situations, the suitability of a multinomial comparison versus multiple binary comparisons can depend on the experimental context. For instance, in a treatment/control experiment that includes multiple time points, a multinomial comparison using all available classes could be useful to visualise the general structure of the data. However, it could make any extracted explanatory features difficult to reason about as to how they relate to the individual experimental time point or treatment conditions. An investigator could instead identify the binary comparisons relevant to the biological question and focus the further classification comparisons to better select for explanatory features.
-In experiments with more than two classes, multinomial random forest classification can be used to assess the discrimination between the classes and give an overview of the relative structure between classes.
-The example data set consists of a total of 6 classes for the day
response variable.
-d %>%
- clsExtract(cls = 'day') %>%
- unique()
-#> [1] 2 3 4 1 H 5
-#> Levels: 1 2 3 4 5 H
Multinomial classification can be performed by:
-
-multinomial_rf <- d %>%
- randomForest(cls = 'day')
-
-print(multinomial_rf)
-#>
-#> Random forest classification
-#>
-#> Samples: 120
-#> Features: 500
-#> Response: day
-#> # comparisons: 1
The performance of this model can be assessed using metrics based on the success of the out of bag (OOB) predictions. The performance metrics can be extracted using:
-
-multinomial_rf %>%
- metrics()
-#> # A tibble: 4 × 5
-#> Response Comparison .metric .estimator .estimate
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 1~2~3~4~5~H accuracy multiclass 0.8
-#> 2 day 1~2~3~4~5~H kap multiclass 0.76
-#> 3 day 1~2~3~4~5~H roc_auc hand_till 0.964
-#> 4 day 1~2~3~4~5~H margin NA 0.146
These metrics include accuracy, Cohen’s kappa (kap), area under the receiver operating characteristic curve (roc_auc, ROC-AUC) and margin. Each metric has both strengths and weaknesses that depend on the context of the classification such as the balance of observations between the classes. As shown below, the class frequencies for this example are balanced with 20 observations per class.
-
-d %>%
- clsExtract(cls = 'day') %>%
- table()
-#> .
-#> 1 2 3 4 5 H
-#> 20 20 20 20 20 20
In this context, each of these metrics could be used to assess the predictive performance of the model. The margin metric is the difference between the proportion of votes for the correct class and the maximum proportion of votes for the other classes for a given observation which is then averaged across all the observations. A positive margin value indicates correct classification and values greater than 0.2 can be considered as the models having strong predictive power. The margin also allows the extent of discrimination to be discerned even in very distinct cases above where both the accuracy and ROC-AUC would be registering values of 1.
-In this example, the values of all the metrics suggest that the model is showing good predictive performance. This can be investigated further by plotting the MDS of observation proximity values.
- - -This shows that the model is able to discriminate highly between classes such as 5
and H
. It is less able to discriminate more similar classes such as H
and 1
or 4
and 5
whose confidence ellipses show a high degree of overlap. This makes sense in the context of this experiment as these are adjacent time points that are more likely to be similar than time points at each end of the experiment.
The ROC curves can also be plotted as shown below.
- - -Classes with their line further from the central dashed line are those that were predicted with the greatest reliability by the model. This plot shows that both the H
and 1
classes were least reliably predicted which is a result of their close proximity shown in the MDS plot previously.
Importance metrics can be used to identify the metabolome features that contribute most to the class discrimination in the model. The available importance metrics for this model are shown below.
-
-importanceMetrics(multinomial_rf)
-#> [1] "1" "2" "3"
-#> [4] "4" "5" "FalsePositiveRate"
-#> [7] "H" "MeanDecreaseAccuracy" "MeanDecreaseGini"
-#> [10] "SelectionFrequency"
Here, we will use the false positive rate metric with a threshold of below 0.05 to identify explanatory features for the day
response variable.
-multinomial_rf %>%
- explanatoryFeatures(metric = 'FalsePositiveRate',
- threshold = 0.05)
-#> # A tibble: 121 × 5
-#> Response Comparison Feature Metric Value
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 1~2~3~4~5~H N341 FalsePositiveRate 1.02e-93
-#> 2 day 1~2~3~4~5~H N133 FalsePositiveRate 7.38e-68
-#> 3 day 1~2~3~4~5~H N163 FalsePositiveRate 3.59e-61
-#> 4 day 1~2~3~4~5~H N439 FalsePositiveRate 1.07e-54
-#> 5 day 1~2~3~4~5~H N342 FalsePositiveRate 3.19e-49
-#> 6 day 1~2~3~4~5~H N377 FalsePositiveRate 3.19e-49
-#> 7 day 1~2~3~4~5~H N171 FalsePositiveRate 6.26e-44
-#> 8 day 1~2~3~4~5~H N497 FalsePositiveRate 6.11e-30
-#> 9 day 1~2~3~4~5~H N146 FalsePositiveRate 2.74e-29
-#> 10 day 1~2~3~4~5~H N195 FalsePositiveRate 7.16e-25
-#> # … with 111 more rows
As shown above there were a total of 121 explanatory features identified.
-Within a multinomial experiment, it is also possible to specify the exact class comparisons to include, where it might not be suitable to compare all the classes at once using the comparisons
argument. This should be specified as a named list, the names corresponding to the cls
argument. Each named element should then consist of a vector of comparisons, the classes to compare separated using the ~
.
The following specifies two comparisons (H~1~2
,H~1~5
) for the day
response variable and displays the performance metrics.
-d %>%
- randomForest(cls = 'day',
- comparisons = list(day = c('H~1~2',
- 'H~1~5'))) %>%
- metrics()
-#> # A tibble: 8 × 5
-#> Response Comparison .metric .estimator .estimate
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day H~1~2 accuracy multiclass 0.833
-#> 2 day H~1~2 kap multiclass 0.75
-#> 3 day H~1~5 accuracy multiclass 0.75
-#> 4 day H~1~5 kap multiclass 0.625
-#> 5 day H~1~2 roc_auc hand_till 0.906
-#> 6 day H~1~5 roc_auc hand_till 0.909
-#> 7 day H~1~2 margin NA 0.172
-#> 8 day H~1~5 margin NA 0.320
The MDS and ROC curve plots can also be plotted simultaneously for the two comparisons.
-
-d %>%
- randomForest(cls = 'day',
- comparisons = list(day = c('H~1~2',
- 'H~1~5'))) %>%
- {plotMDS(.,cls = 'day') +
- plotROC(.) +
- patchwork::plot_layout(ncol = 1)}
Similarly, it is also possible to model multiple response factors with a single random forest call by specifying a vector of response class information column names to the cls
argument. In the following, both the name
and day
response factors will be analysed and the performance metrics returned in a single table.
-d %>%
- randomForest(cls = c('name','day')) %>%
- metrics()
-#> Warning: Classes with < 5 replicates removed: "11_3", "11_4", "11_5", "11_6",
-#> "11_H", "12_1", "12_3", "12_6", "12_H", "13_1", "13_2", "13_3", "13_5", "13_6",
-#> "13_H", "14_2", "14_3", "14_5", "14_6", "14_H", "15_1", "15_2", "15_4", "15_5",
-#> "15_6", "15_H"
-#> Unbalanced classes detected. Stratifying sample size to the smallest class size.
-#> # A tibble: 8 × 5
-#> Response Comparison .metric .estimator .estimate
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 name 11_2~12_2~12_4~13_4~14_4~15_3 accuracy multiclass 0.35
-#> 2 name 11_2~12_2~12_4~13_4~14_4~15_3 kap multiclass 0.212
-#> 3 name 11_2~12_2~12_4~13_4~14_4~15_3 roc_auc hand_till 0.753
-#> 4 name 11_2~12_2~12_4~13_4~14_4~15_3 margin NA -0.0485
-#> 5 day 1~2~3~4~5~H accuracy multiclass 0.8
-#> 6 day 1~2~3~4~5~H kap multiclass 0.76
-#> 7 day 1~2~3~4~5~H roc_auc hand_till 0.964
-#> 8 day 1~2~3~4~5~H margin NA 0.146
The MDS plots can also be returned for both models simultaneously.
-
-d %>%
- randomForest(cls = c('name','day')) %>%
- plotMDS()
-#> Warning: Classes with < 5 replicates removed: "11_3", "11_4", "11_5", "11_6",
-#> "11_H", "12_1", "12_3", "12_6", "12_H", "13_1", "13_2", "13_3", "13_5", "13_6",
-#> "13_H", "14_2", "14_3", "14_5", "14_6", "14_H", "15_1", "15_2", "15_4", "15_5",
-#> "15_6", "15_H"
-#> Unbalanced classes detected. Stratifying sample size to the smallest class size.
It may in some cases be preferable to analyse class comparisons as multiple binary comparisons.
-The possible binary comparisons for a given response variable can be displayed using the binaryComparisons()
method. Below shows the 15 comparisons for the day
response variable.
-binaryComparisons(d,cls = 'day')
-#> [1] "1~2" "1~3" "1~4" "1~5" "1~H" "2~3" "2~4" "2~5" "2~H" "3~4" "3~5" "3~H"
-#> [13] "4~5" "4~H" "5~H"
For this example we will only use the binary comparisons containing the H
class.
-binary_comparisons <- binaryComparisons(d,cls = 'day') %>%
- .[stringr::str_detect(.,'H')]
The binary comparisons can then be performed using the following.
-
-binary_rf <- d %>%
- randomForest(cls = 'day',
- comparisons = list(day = binary_comparisons))
-
-print(binary_rf)
-#>
-#> Random forest classification
-#>
-#> Samples: 120
-#> Features: 500
-#> Response: day
-#> # comparisons: 5
To run all possible binary comparisons, the binary = TRUE
argument could instead be used.
The MDS plots for each comparison can be visualised to inspect the comparisons.
- - -These plots show good separation in all the comparisons except H~1
which is also shown by the plot of the performance metrics below. Each of the comparisons is showing perfect performance for the accuracy, Cohen’s kappa and ROC-AUC metrics as well as very high margin values except for the H~1
comparison.
-binary_rf %>%
- plotMetrics()
The explanatory features for these comparisons can be extracted as below using the false positive rate metric and a cut-off threshold of 0.05. This gives a total of 251 explanatory features.
-
-binary_rf %>%
- explanatoryFeatures(metric = 'FalsePositiveRate',
- threshold = 0.05)
-#> # A tibble: 251 × 5
-#> Response Comparison Feature Metric Value
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 2~H N341 FalsePositiveRate 7.34e-52
-#> 2 day 2~H N439 FalsePositiveRate 1.80e-45
-#> 3 day 3~H N342 FalsePositiveRate 2.71e-39
-#> 4 day 2~H N327 FalsePositiveRate 1.06e-35
-#> 5 day 3~H N439 FalsePositiveRate 1.06e-35
-#> 6 day 2~H N477 FalsePositiveRate 1.60e-34
-#> 7 day 3~H N377 FalsePositiveRate 1.60e-34
-#> 8 day 4~H N477 FalsePositiveRate 7.40e-34
-#> 9 day 2~H N447 FalsePositiveRate 6.48e-30
-#> 10 day 3~H N163 FalsePositiveRate 6.48e-30
-#> # … with 241 more rows
A heatmap of these explanatory features can be plotted to show their mean relative intensities across the experiment time points. Here, the classes are also refactored to customise the order of the classes on the x-axis.
-
-refactor_cls <- clsExtract(binary_rf,
- cls = 'day') %>%
- factor(.,levels = c('H','1','2','3','4','5'))
-
-binary_rf <- clsReplace(binary_rf,
- value = refactor_cls,
- cls = 'day')
-binary_rf %>%
- plotExplanatoryHeatmap(metric = 'FalsePositiveRate',
- threshold = 0.05,
- featureNames = TRUE)
Random forest regression can be used to assess the extent of association of the metabolomic data with continuous response variables.
-In this example, the extent of association of injection order with the example data will be assessed.
-
-regression_rf <- d %>%
- randomForest(cls = 'injorder')
-
-print(regression_rf)
-#>
-#> Random forest regression
-#>
-#> Samples: 120
-#> Features: 500
-#> Response: injorder
The regression model performance metrics, based on the OOB prediction error, can be extracted using the following:
-
-regression_rf %>%
- metrics()
-#> # A tibble: 5 × 4
-#> Response .metric .estimator .estimate
-#> <chr> <chr> <chr> <dbl>
-#> 1 injorder rsq standard 0.476
-#> 2 injorder mae standard 23.5
-#> 3 injorder mape standard 154.
-#> 4 injorder rmse standard 26.5
-#> 5 injorder ccc standard 0.508
These regression metrics include R2 (rsq
), mean absolute error (mae
), mean absolute percentage error (mape
), root mean squared error (rmse
) and the concordance correlation coefficient (ccc
).
The R2 and concordance correlation coefficient metrics suggest that there is some association of features with the injection order, although this is weak. This is in agreement with the mean absolute error metric, which shows that, on average, the injection order could only be predicted to an accuracy of 23 injection order positions.
-The MDS plot below shows the relative proximities of the samples based on this injection order regression model. This shows that for the most part, there is little correspondence of the sample positions with their injection order. However, there is a small grouping of samples towards the end of the run around samples ~99 to 120. It suggests that there could have been some analytical issues, for certain features, towards the end of the mass spectral analytical run.
-
-regression_rf %>%
- plotMDS(cls = NULL,
- ellipses = FALSE,
- label = 'injorder',
- labelSize = 3)
-#> Warning: ggrepel: 40 unlabeled data points (too many overlaps). Consider
-#> increasing max.overlaps
The available feature importance metrics for this regression model can be listed.
-
-regression_rf %>%
- importanceMetrics()
-#> [1] "%IncMSE" "IncNodePurity"
The feature importance metrics can be plotted to give an overview of their distribution. The following will plot the percentage increase in the mean squared error (%IncMSE
) importance metric.
-regression_rf %>%
- plotImportance(metric = "%IncMSE",
- rank = FALSE)
This shows that there are only a few features that are contributing to the association with injection order. These explanatory features can be extracted with the following, using a threshold of above 5.
-
-regression_rf %>%
- explanatoryFeatures(metric = '%IncMSE',
- threshold = 5)
-#> # A tibble: 7 × 4
-#> Response Feature Metric Value
-#> <chr> <chr> <chr> <dbl>
-#> 1 injorder N283 %IncMSE 19.9
-#> 2 injorder N135 %IncMSE 8.71
-#> 3 injorder N451 %IncMSE 5.58
-#> 4 injorder N161 %IncMSE 5.51
-#> 5 injorder N306 %IncMSE 5.49
-#> 6 injorder N118 %IncMSE 5.22
-#> 7 injorder N297 %IncMSE 5.07
This returned a total of 7 explanatory features above this threshold. The top ranked feature N283
can be plotted to investigate its trend in relation to injection order.
-regression_rf %>%
- plotFeature(feature = 'N283',
- cls = 'injorder')
This shows an increase in the intensity of that feature for samples above 100 in the injection order which corresponds with the cluster that was seen in the MDS plot above.
-Univariate methods select features, explanatory for response variables, with features tested on an individual basis. These methods offer simplicity and easy interpretation in their use, however they provide no information as to how features may interact.
-The univariate methods currently available in metabolyseR
include Welch’s t-test, analysis of variance (ANOVA) and linear regression. The following sections will provide brief examples of the use of each of these methods.
Welch’s t-test can be used to select explanatory metabolome features for binary comparisons of discrete variables. By default, all the possible binary comparisons for the categories of a response variable will be tested.
-Below shows the possible binary comparisons for the day
response variable for the example data set.
-binaryComparisons(d,
- cls = 'day')
-#> [1] "1~2" "1~3" "1~4" "1~5" "1~H" "2~3" "2~4" "2~5" "2~H" "3~4" "3~5" "3~H"
-#> [13] "4~5" "4~H" "5~H"
For the following example, only a subset of comparisons will be tested. These will be selected by supplying a list to the comparisons
argument.
-ttest_analysis <- ttest(d,
- cls = 'day',
- comparisons = list(day = c('H~1',
- 'H~2',
- 'H~5')))
-
-print(ttest_analysis)
-#>
-#> Univariate t-test analysis
-#>
-#> Samples: 120
-#> Features: 500
-#> Responses: day
-#> # comparisons: 3
The explanatory features that show a significant difference between the response categories can be extracted as shown below.
-
-explanatoryFeatures(ttest_analysis,
- threshold = 0.05)
-#> # A tibble: 73 × 14
-#> Response Comparison Feature estimate estimate1 estimate2 statistic p.value
-#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day H~5 N163 -735. 19.5 755. -13.8 1.43e-11
-#> 2 day H~5 N341 2445. 2537. 92.6 13.6 2.88e-11
-#> 3 day H~5 N133 1055. 1077. 21.9 13.0 5.44e-11
-#> 4 day H~2 N341 200. 293. 92.6 10.6 1.38e-10
-#> 5 day H~5 N171 62.6 64.7 2.15 11.9 2.62e-10
-#> 6 day H~5 N119 17.2 17.9 0.763 11.0 8.54e-10
-#> 7 day H~5 N342 243. 247. 4.13 10.8 1.42e- 9
-#> 8 day H~5 N343 27.4 28.3 0.961 9.83 5.99e- 9
-#> 9 day H~5 N377 152. 157. 5.05 9.81 6.75e- 9
-#> 10 day H~5 N477 103. 129. 26.1 9.30 1.05e- 8
-#> # … with 63 more rows, and 6 more variables: parameter <dbl>, conf.low <dbl>,
-#> # conf.high <dbl>, method <chr>, alternative <chr>, adjusted.p.value <dbl>
This will threshold the features based on their adjusted p-value, found in the adjusted.p.value
column of the table. The results of all of the features can be returned using the importance()
method.
A heat map of the explanatory features can be plotted to inspect the relative trends of the explanatory features in relation to the response variable
.
-plotExplanatoryHeatmap(ttest_analysis)
ANOVA can be used to select explanatory features for discrete response variables with 3 or more categories. The following example will compare all the categories in the day
response variable. However, the comparisons
argument can be used to select particular comparisons of interest.
-anova_analysis <- anova(d,
- cls = 'day')
-
-print(anova_analysis)
-#>
-#> Univariate ANOVA analysis
-#>
-#> Samples: 120
-#> Features: 500
-#> Responses: day
-#> # comparisons: 1
The explanatory features that are significantly different between the categories can then be extracted.
-
-explanatoryFeatures(anova_analysis,
- threshold = 0.05)
-#> # A tibble: 110 × 10
-#> Response Comparison Feature term df sumsq meansq statistic p.value
-#> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H N341 response 5 1.09e8 2.17e7 124. 1.90e-44
-#> 2 day 1~2~3~4~5~H N163 response 5 1.25e7 2.51e6 113. 1.71e-42
-#> 3 day 1~2~3~4~5~H N133 response 5 1.96e7 3.92e6 108. 1.71e-41
-#> 4 day 1~2~3~4~5~H N171 response 5 6.29e4 1.26e4 88.8 1.16e-37
-#> 5 day 1~2~3~4~5~H N342 response 5 1.04e6 2.07e5 85.1 7.61e-37
-#> 6 day 1~2~3~4~5~H N343 response 5 1.19e4 2.38e3 66.1 4.43e-32
-#> 7 day 1~2~3~4~5~H N119 response 5 4.92e3 9.83e2 53.8 2.07e-28
-#> 8 day 1~2~3~4~5~H N497 response 5 1.10e5 2.20e4 49.6 4.83e-27
-#> 9 day 1~2~3~4~5~H N137 response 5 6.32e3 1.26e3 39.9 1.59e-23
-#> 10 day 1~2~3~4~5~H N277 response 5 6.31e4 1.26e4 39.1 3.14e-23
-#> # … with 100 more rows, and 1 more variable: adjusted.p.value <dbl>
The top ranked explanatory feature N341
can be plotted to inspect its trend relative to the day
response variable.
-plotFeature(anova_analysis,
- feature = 'N341',
- cls = 'day')
Univariate linear regression can be used to associate a continuous response variable with metabolome features. In the example below, the example data will be regressed against injection order to identify any linearly associated metabolome features.
-
-lr_analysis <- linearRegression(d,
- cls = 'injorder')
-
-print(lr_analysis)
-#>
-#> Univariate linear regression analysis
-#>
-#> Samples: 120
-#> Features: 500
-#> Responses: injorder
The explanatory features can then be extracted.
-
-explanatoryFeatures(lr_analysis)
-#> # A tibble: 8 × 15
-#> Response Feature r.squared adj.r.squared sigma statistic p.value df logLik
-#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 injorder N283 0.310 0.304 4.27 53.0 4.10e-11 1 -343.
-#> 2 injorder N135 0.165 0.157 78.7 23.2 4.31e- 6 1 -693.
-#> 3 injorder N221 0.140 0.133 5.87 19.3 2.50e- 5 1 -382.
-#> 4 injorder N473 0.135 0.127 7.24 18.3 3.78e- 5 1 -407.
-#> 5 injorder N335 0.132 0.124 20.1 17.9 4.59e- 5 1 -529.
-#> 6 injorder N452 0.120 0.112 4.00 16.0 1.10e- 4 1 -335.
-#> 7 injorder N255 0.119 0.111 11.1 15.9 1.17e- 4 1 -458.
-#> 8 injorder N267 0.118 0.111 26.4 15.8 1.22e- 4 1 -562.
-#> # … with 6 more variables: AIC <dbl>, BIC <dbl>, deviance <dbl>,
-#> # df.residual <int>, nobs <int>, adjusted.p.value <dbl>
The top ranked explanatory feature N283
can be plotted to inspect its association with injection order.
-plotFeature(lr_analysis,
- feature = 'N283',
- cls = 'injorder')
For routine analyses, the initial analysis parameters for pre-treatment of the data and then the modelling can be selected.
-
-p <- analysisParameters(c('pre-treatment','modelling'))
More specific parameters for pre-treatment of the example data can be declared using the following.
-
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- keep = 'classes',
- occupancyFilter = 'maximum',
- transform = 'TICnorm'
- )
-)
The modellingMethods()
function can be used to list the modelling methods that are currently available in metabolyseR
.
-modellingMethods()
-#> [1] "anova" "ttest" "linearRegression" "randomForest"
The modellingParameters()
function can be used to retrieve the default parameters for specific modelling methods. Below, the default modelling parameters for the randomForest
and ttest
methods are specified.
-parameters(p,'modelling') <- modellingParameters(c('randomForest','ttest'))
The class parameters can then be universally specified for both the pre-treatment and modelling elements. For this example, the day
response variable will be used with just the H
and 2
classes.
-changeParameter(p,'cls') <- 'day'
-changeParameter(p,'classes') <- c('H','2')
This gives the following parameters for the analysis.
-
-p
-#> Parameters:
-#> pre-treatment
-#> keep
-#> classes
-#> cls = day
-#> classes = c("H", "2")
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
-#>
-#> modelling
-#> randomForest
-#> cls = day
-#> rf = list()
-#> reps = 1
-#> binary = FALSE
-#> comparisons = list()
-#> perm = 0
-#> returnModels = FALSE
-#> seed = 1234
-#> ttest
-#> cls = day
-#> pAdjust = bonferroni
-#> comparisons = list()
-#> returnModels = FALSE
The analysis can then be executed.
-<- metabolyse(abr1$neg,abr1$fact,p)
- analysis #>
[34m
-#> metabolyseR
[39m
[31mv0.14.9
[39m Thu Jan 27 12:00:15 2022
-#> ________________________________________________________________________________
-#>
[33m
[33mParameters:
[33m
[39m
-#> pre-treatment
-#> keep
-#> classes
-#> cls = day
-#> classes = c("H", "2")
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
-#>
-#> modelling
-#> randomForest
-#> cls = day
-#> rf = list()
-#> reps = 1
-#> binary = FALSE
-#> comparisons = list()
-#> perm = 0
-#> returnModels = FALSE
-#> seed = 1234
-#> ttest
-#> cls = day
-#> pAdjust = bonferroni
-#> comparisons = list()
-#> returnModels = FALSE
-#> ________________________________________________________________________________
-#>
[34mPre-treatment
[39m…
-
--treatment
[39m
[32m✓
[39m [4.6S]
-
[34mPre#>
[34mModelling
[39m…
-
-
[34m3.1S]
- Modelling
[39m
[32m✓
[39m [#> ________________________________________________________________________________
-#>
-#>
[32mComplete!
[39m[7.8S]
The results for the modelling can be specifically extracted using the following.
-
-analysisResults(analysis,'modelling')
-#> $randomForest
-#>
-#> Random forest classification
-#>
-#> Samples: 40
-#> Features: 1713
-#> Response: day
-#> # comparisons: 1
-#>
-#>
-#> $ttest
-#>
-#> Univariate t-test analysis
-#>
-#> Samples: 40
-#> Features: 1713
-#> Responses: day
-#> # comparisons: 1
This returns the results as a list containing the modelling results objects for each specified method.
-Alternatively, the modelling results can be accessed directly from the Analysis
object. Below shows the extraction of the explanatory features, using default parameters for each method, with the results returned in a single table.
-explanatory_features <- analysis %>%
- explanatoryFeatures()
-
-print(explanatory_features)
-#> # A tibble: 100 × 17
-#> Method Response Comparison Feature Metric Value estimate estimate1
-#> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
-#> 1 randomForest day 2~H N341 FalsePo… 8.06e-28 NA NA
-#> 2 randomForest day 2~H N377 FalsePo… 5.70e-18 NA NA
-#> 3 randomForest day 2~H N447 FalsePo… 5.70e-18 NA NA
-#> 4 randomForest day 2~H N579 FalsePo… 5.70e-18 NA NA
-#> 5 randomForest day 2~H N1084 FalsePo… 1.19e-16 NA NA
-#> 6 randomForest day 2~H N327 FalsePo… 2.33e-15 NA NA
-#> 7 randomForest day 2~H N580 FalsePo… 4.32e-14 NA NA
-#> 8 randomForest day 2~H N1083 FalsePo… 7.49e-13 NA NA
-#> 9 randomForest day 2~H N1085 FalsePo… 7.49e-13 NA NA
-#> 10 randomForest day 2~H N503 FalsePo… 7.49e-13 NA NA
-#> # … with 90 more rows, and 9 more variables: estimate2 <dbl>, statistic <dbl>,
-#> # p.value <dbl>, parameter <dbl>, conf.low <dbl>, conf.high <dbl>,
-#> # method <chr>, alternative <chr>, adjusted.p.value <dbl>
Heat maps of the explanatory features can also be plotted for both the modelling methods.
-
-plotExplanatoryHeatmap(analysis) %>%
- patchwork::wrap_plots()
vignettes/pre_treatment.Rmd
- pre_treatment.Rmd
Metabolomics data from any analytical technique requires various data pre-treatment steps prior to subsequent data mining or other downstream analyses. This aids both the data quality and integrity. It is important that appropriate pre-treatment strategies are used not only for the analytical technique being applied but are also suitable for the statistical or machine learning analyses that are to be utilised. Careful consideration of the pre-treatment steps to be undertaken is required as they can have a substantial influence on the results and inferences taken from metabolomic analyses.
-Data pre-treatment is the most faceted aspect of the analysis elements in metabolyseR. It is itself made up of a number of elements, which themselves are made up of methods. The following document will outline the application of each of these pre-treatment elements for use in exploratory analyses then outline how to apply them in routine analyses. For an introduction to the usage of metabolyseR for both exploratory and routine analyses, see the introduction vignette using:
-
-vignette('introduction','metabolyseR')
To further supplement this document, a quick start example analysis is also available as a vignette:
-
-vignette('quick_start','metabolyseR')
To begin, the package can be loaded using:
-
-library(metabolyseR)
-#>
-#> Attaching package: 'metabolyseR'
-#> The following object is masked from 'package:stats':
-#>
-#> anova
-#> The following objects are masked from 'package:base':
-#>
-#> raw, split
The examples used here will use the abr1
data set from the metaboData package. This is nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data from a plant-pathogen infection time course experiment. The pipe %>%
from the magrittr package will also be used. The example data can be loaded using:
Only the negative acquisition mode data (abr1$neg
) will be used along with the sample meta-information (abr1$fact
). Create an AnalysisData
class object, assigned to the variable d
, using the following:
-d <- analysisData(abr1$neg,abr1$fact)
-print(d)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2000
-#> Info: 9
As can be seen above the data set contains a total of 120 samples and 2000 features.
-The package supports parallel processing using the future package.
-By default, processing by metabolyseR
will be done sequentially. However, parallel processing can be activated, prior to analysis, by specifying a parallel implementation using plan()
. The following example specifies using the multisession
implementation (multiple background R sessions) with two worker processes.
-plan(future::multisession,workers = 2)
See the future package documentation for more information on the types of parallel implementations that are available.
-The following sections will outline the numerous pre-treatment elements available within metabolyseR. There will be examples of their application during exploratory analyses along with useful visualisations. These can aid interpretation of when particular treatments should be applied as well as their effect once they have been used.
-In many situations, it will be necessary to exclude either individual samples, sample classes or certain features from further analysis.
-Individual samples can be removed using removeSamples()
as below, where the idx
argument stipulates the sample information column containing the sample indexes and the samples
argument a vector of sample indexes to remove.
-d %>%
- removeSamples(idx = 'injorder',samples = 1)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 119
-#> Features: 2000
-#> Info: 9
The removeClasses
function can be used similarly to remove whole classes from further analysis:
-d %>%
- removeClasses(cls = 'day',classes = 'H')
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 100
-#> Features: 2000
-#> Info: 9
The following will enable the removal of specified features as a vector supplied to the features
argument:
-d %>%
- removeFeatures(features = c('N1','N2'))
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1998
-#> Info: 9
There could be occasions where the numbers of samples, classes or features to remove are greater than the numbers of samples, classes or features that are to be retained. In these situations it will be more convenient to directly specify the samples, classes or features to retain. Keeping samples, classes or features is outlined in the following section.
-Often it will be necessary to retain only particular samples, sample classes or certain features for further analysis.
-Individual samples can be kept using keepSamples()
as below, where the idx
argument stipulates the sample information column containing the sample indexes and the samples
argument, a vector of sample indexes to keep.
-d %>%
- keepSamples(idx = 'injorder',samples = 1)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 1
-#> Features: 2000
-#> Info: 9
The keepClasses()
method can be used similarly to keep whole classes for further analysis:
-d %>%
- keepClasses(cls = 'day',classes = 'H')
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 2000
-#> Info: 9
The following will specify features to keep, with a vector of feature names supplied to the features
argument:
-d %>%
- keepFeatures(features = c('N1','N2'))
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2
-#> Info: 9
There are likely to be occasions where the numbers of samples, classes or features to keep are greater than the numbers of samples, classes or features that are to be excluded. In these situations it will be more convenient to directly specify the samples, classes or features to remove. Removing samples, classes or features is outlined in the previous section.
-Occupancy provides a useful metric by which to filter poorly represented features (features containing a majority zero or missing values). An occupancy threshold provides a means of specifying this majority with variables below the threshold excluded from further analyses. However, this can be complicated by an underlying class structure present within the data where a variable may be well represented within one class but not in another.
-The proportional occupancy for each feature within a data set for a given class structure can be calculated using the occupancy()
method, specifying the sample information column using the cls
argument.
-d %>%
- occupancy(cls = 'day')
-#> # A tibble: 11,914 × 5
-#> day Feature N `Class total` Occupancy
-#> <fct> <chr> <dbl> <int> <dbl>
-#> 1 1 N1 0 20 0
-#> 2 1 N10 0 20 0
-#> 3 1 N100 0 20 0
-#> 4 1 N1000 20 20 1
-#> 5 1 N1001 20 20 1
-#> 6 1 N1002 20 20 1
-#> 7 1 N1003 20 20 1
-#> 8 1 N1004 20 20 1
-#> 9 1 N1005 20 20 1
-#> 10 1 N1006 20 20 1
-#> # … with 11,904 more rows
Alternatively the occupancy distributions can be plotted providing a useful overview of the data set:
-
-d %>%
- plotOccupancy(cls = 'day')
It can be seen that there are a number of unoccupied features across all the sample classes with a small rise in the density distribution near 0.
-There are two strategies for thresholding occupancy. The first is a maximum threshold; where the maximum occupancy across all classes is above the threshold. Therefore, for a feature to be retained, only a single class needs to have an occupancy above the threshold. It is this strategy that will be appropriate for most applications. A two-thirds maximum occupancy filter can be applied to the day
sample information column of our data using:
-maximum_occupancy_filtered <- d %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3)
It can be seen below that this removes 240 features.
-
-print(maximum_occupancy_filtered)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1760
-#> Info: 9
Plotting the occupancy distributions shows that all the low occupancy features have now been removed.
-
-maximum_occupancy_filtered %>%
- plotOccupancy(cls = 'day')
The alternative strategy is by applying a minimum threshold; where the minimum occupancy across all classes is required to be above the threshold. Therefore, for a feature to be retained, all classes would need to have an occupancy above the threshold. A two-thirds minimum occupancy filter can be applied to the day
sample information column of our data using:
-minimum_occupancy_filtered <- d %>%
- occupancyMinimum(cls = 'day',occupancy = 2/3)
It can be seen below that this removes 344 features.
-
-print(minimum_occupancy_filtered)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1656
-#> Info: 9
Prior to downstream analyses, metabolomics data often require transformation to fulfill the assumptions of a particular statistical/data mining technique.
-There are a wide range of transformation methods available that are commonly used for the analysis of metabolomics data. These methods are all named with the prefix transform
.
The effects of a transformation on a data set can be assessed using a supervised classification approach. The following performs a supervised random forest analysis of the example data and plots the results using both multidimensional scaling (MDS) and receiver operator characteristic (ROC) curves.
-
-d %>%
- plotSupervisedRF(cls = 'day')
Alternatively a log10 transformation can be applied prior to analysis:
-
-d %>%
- transformLog10() %>%
- plotSupervisedRF(cls = 'day')
Or a total ion count (TIC) normalisation where each individual sample is corrected by its TIC. This is one method that can be used to account for small variability in sample concentration.
-
-d %>%
- transformTICnorm() %>%
- plotSupervisedRF(cls = 'day')
The margin value is a metric that can be used to assess model performance. Positive values indicate a model's ability, on average, to correctly predict the class labels of the analysed data.
-As can be seen in the plots above, the transformations have little effect on the overall structure of the data set. However, there are small increases in the margins of the transformed data (model improvement). Note that here, a non-parametric machine learning approach has been applied to assess the effects of the transformations on the data. Using a different approach, such as the parametric analysis of variance (ANOVA), which has different underlying assumptions, will likely give different results to the assessment above.
-Sample aggregation allows the electronic pooling of samples based on a grouping variable. This is useful in situations such as the presence of technical replicates that can be aggregated to reduce the effects of pseudo replication. metabolyseR
provides methods for mean, median and sum aggregation and each starts with the aggregate
prefix.
Below shows a principal component analysis (PCA) plot of the example data coloured by the classes of the day
sample information column. It is first maximum occupancy filtered to remove empty features.
-d %>%
- occupancyMaximum(cls = 'day') %>%
- plotPCA(cls = 'day')
The example below shows the mean aggregation of the data using the experimental classes within the day
sample information column.
-day_mean <- d %>%
- occupancyMaximum(cls = 'day') %>%
- aggregateMean(cls = 'day')
The PCA plot below shows these class averages of the data.
-
-plotPCA(day_mean,cls = 'day',ellipses = FALSE)
There can sometimes be artificial batch related variability introduced into metabolomics analyses as a result of analytical instrumentation or sample preparation. With appropriate sample randomisation (see section on feature filtering based on QC samples), batch related variability can be corrected for using an average centring correction method, applied to the individual features.
-The plot below shows differences in the TIC distributions for each of the classes in the day
sample information column.
The data can then be corrected by class average centring as shown below.
-
-corrected_data <- d %>%
- correctionCenter(block = 'day',type = 'median')
The plot of the TICs below shows that the inter-class variability has been removed but the intra-class variability has been retained.
-
-plotTIC(corrected_data,
- by = 'day',
- colour = 'day')
Missing values can have an important influence on downstream analyses with zero values heavily influencing the outcomes of parametric tests. Where and how they are imputed are important considerations and this is highly related to variable occupancy. The methods provided here allow both these aspects to be taken into account and utilise Random Forest imputation using the missForest package.
-Below shows a Linear Discriminant Analysis (LDA) plot of the example data. The eigenvalue (Tw) gives a comparable indication of the separation between the sample classes.
-
-d %>%
- keepClasses(cls = 'day',classes = c('H','5')) %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3) %>%
- plotLDA(cls = 'day')
The following shows the same, except there is an application of imputation prior to the LDA. The imputed data is based on the data of all the samples present in the data set. It shows a very slight drop in the eigenvalue and therefore reduced separation between the sample classes.
-
-d %>%
- keepClasses(cls = 'day',classes = c('H','5')) %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3) %>%
- imputeAll(parallel = 'variables') %>%
- plotLDA(cls = 'day')
Imputation accuracy is likely to be reduced if data is sparse or there is underlying class structure where there is significant discrimination. Below shows the application of imputation prior to the LDA, except this time the imputation is class-wise. The imputed data is based only on the values of other samples within the class.
-
-d %>%
- keepClasses(cls = 'day',classes = c('H','5')) %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3) %>%
- imputeClass(cls = 'day') %>%
- plotLDA(cls = 'day')
This shows a slight increase in the eigenvalue with the classes showing greater separation. This is likely due to the increased accuracy of the imputed data relative to the class structure.
-A QC sample is an average pooled sample, equally representative in composition of all the samples present within an experimental set. Within an analytical run, the QC sample is analysed at equal intervals throughout the run. If there is class structure within the run, this should be randomised within a block fashion so that the classes are equally represented in each block throughout the run. A QC sample can then be injected and analysed between these randomised blocks. This provides a set of technical injections that allows the variability in instrument performance over the run to be accounted for and the robustness of the acquired variables to be assessed.
-The technical reproducibility of an acquired variable can be assessed using its relative standard deviation (RSD) within the QC samples. The variable RSDs can then be filtered below a threshold value to remove metabolome features that are poorly reproducible across the analytical runs. This variable filtering strategy has an advantage over that of occupancy alone as it is not dependent on underlying class structure. Therefore, the variables and variable numbers will not alter if a new class structure is imposed upon the data.
-The example data set does not include QC samples. For this example, the H
class will be used.
Firstly, the RSD distribution will be assessed for only the H
class. The following retains only the H
class samples to aid visualisation.
-QC <- d %>%
- keepClasses(cls = 'day',classes = 'H')
The table of RSD values for each of the features can be computed as below.
-
-QC %>%
- rsd(cls = 'day')
-#> # A tibble: 2,000 × 5
-#> day Feature Mean SD RSD
-#> <fct> <chr> <dbl> <dbl> <dbl>
-#> 1 H N1 0 0 NaN
-#> 2 H N10 0 0 NaN
-#> 3 H N100 0 0 NaN
-#> 4 H N1000 114. 19.4 17.0
-#> 5 H N1001 99.2 21.6 21.7
-#> 6 H N1002 86.7 23.9 27.6
-#> 7 H N1003 82.3 18.0 21.9
-#> 8 H N1004 91.6 18.8 20.5
-#> 9 H N1005 78.2 14.0 17.9
-#> 10 H N1006 78.6 21.3 27.1
-#> # … with 1,990 more rows
The distributions of the feature RSD values can be plotted for the H
class.
-QC %>%
- plotRSD(cls = 'day')
-#> Warning: Removed 123 rows containing non-finite values (stat_density).
-#> Warning: Removed 1 row(s) containing missing values (geom_path).
This shows that there are a number of features with very high RSD values and therefore poor analytical robustness. Many of these are likely to be as a result of poor occupancy and zero values. Applying an occupancy filter prior to plotting does indeed show a reduction in the upper range of RSD values retained.
-
-QC %>%
- occupancyMaximum(cls = 'day',occupancy = 2/3) %>%
- plotRSD(cls = 'day')
metabolyseR
contains a number of methods for applying pre-treatment routines specifically on QC samples and are all prefixed with QC
. These include methods for feature filtering of a data set based on the occupancy of the QC class, imputation of the QC class only, feature filtering based on the RSD values of the QC class and removal of only the QC class.
Below shows an example of applying some of these QC methods. This will first filter the features in the data set based on the occupancy of the QC class. Then the features are filtered based on the RSD values of the QC class using an RSD threshold of 50%. The class index of the QC samples is specified using the QCidx
argument.
-QC_filtered <- d %>%
- QCoccupancy(cls = 'day',QCidx = 'H',occupancy = 2/3) %>%
- QCrsdFilter(cls = 'day',QCidx = 'H',RSDthresh = 50)
This removes a total of 637 features.
-
-print(QC_filtered)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1363
-#> Info: 9
For routine analyses, the available pre-treatment elements can be retrieved using:
-
-preTreatmentElements()
-#> [1] "aggregate" "correction" "impute" "keep"
-#> [5] "occupancyFilter" "QC" "remove" "transform"
The available methods for a specified pre-treatment element can be viewed using:
-
-preTreatmentMethods('remove')
-#> [1] "classes" "features" "samples"
The default pre-treatment parameters can first be assigned to the variable p
.
-p <- analysisParameters('pre-treatment')
The preTreatmentParameters()
function allows the parameters for particular pre-treatment elements to be specified. The following specifies the pre-treatment elements that will be used for this data set. These will include the keeping of certain sample classes, the filtering of features based on class occupancy and the application of a TIC normalisation. These will be assigned to the p
variable using the parameters()
method.
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- keep = 'classes',
- occupancyFilter = 'maximum',
- transform = 'TICnorm'
- )
-)
Printing p
shows these pre-treatment steps.
-print(p)
-#> Parameters:
-#> pre-treatment
-#> keep
-#> classes
-#> cls = class
-#> classes = c()
-#> occupancyFilter
-#> maximum
-#> cls = class
-#> occupancy = 2/3
-#> transform
-#> TICnorm
Next, the day
sample information column can be specified, along with the classes to be kept which will be the H
, the 1
and the 2
classes.
-changeParameter(p,'cls') <- 'day'
-changeParameter(p,'classes') <- c('H','1','2')
Printing p
shows the final pre-treatment parameters that will be used for this analysis.
-print(p)
-#> Parameters:
-#> pre-treatment
-#> keep
-#> classes
-#> cls = day
-#> classes = c("H", "1", "2")
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
The pre-treatment routine can then be executed.
-analysis <- metabolyse(abr1$neg,abr1$fact,p)
-#>
[34m
-#> metabolyseR
[39m
[31mv0.14.9
[39m Thu Jan 27 12:03:18 2022
-#> ________________________________________________________________________________
-#>
[33m
[33mParameters:
[33m
[39m
-#> pre-treatment
-#> keep
-#> classes
-#> cls = day
-#> classes = c("H", "1", "2")
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
-#> ________________________________________________________________________________
-#>
[34mPre-treatment
[39m…
-
--treatment
[39m
[32m✓
[39m [6.5S]
-
[34mPre#> ________________________________________________________________________________
-#>
-#>
[32mComplete!
[39m[6.5S]
Printing the analysis
object shows the resulting data from the pre-treatment routine.
-print(analysis)
-#>
-#> metabolyseR v0.14.9
-#> Analysis:
-#> Thu Jan 27 12:03:18 2022
-#>
-#> Raw Data:
-#> No. samples = 120
-#> No. features = 2000
-#>
-#> Pre-treated Data:
-#> Thu Jan 27 12:03:24 2022
-#> No. samples = 60
-#> No. features = 1723
The pre-treated data can be extracted from the Analysis
object using several methods.
Firstly the analysisResults()
method.
-analysisResults(analysis,'pre-treatment')
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 60
-#> Features: 1723
-#> Info: 9
And secondly the preTreated()
method.
-preTreated(analysis)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 60
-#> Features: 1723
-#> Info: 9
A supervised random forest analysis can be used to visualise the structure of the resulting pre-treated data.
-
-analysis %>%
- plotSupervisedRF(cls = 'day',type = 'pre-treated')
vignettes/quick_start.Rmd
- quick_start.Rmd
This example analysis will use the abr1
data set from the metaboData package. It is nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data from a plant-pathogen infection time course experiment. The analysis will also include use of the pipe %>%
from the magrittr package. First load the necessary packages.
For this example we will use only the negative acquisition mode data (abr1$neg
) and sample meta-information (abr1$fact
). Create an AnalysisData
class object using the following:
-d <- analysisData(abr1$neg,abr1$fact)
The data includes 120 samples and 2000 mass spectral features as shown below.
-
-d
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2000
-#> Info: 9
The clsAvailable()
function can be used to identify the columns available in our meta-information table.
-clsAvailable(d)
-#> [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name" "rep"
-#> [8] "day" "class"
For this analysis, we will be using the infection time course class information contained in the day
column. This can be extracted and the class frequencies tabulated using the following:
-d %>%
- clsExtract(cls = 'day') %>%
- table()
-#> .
-#> 1 2 3 4 5 H
-#> 20 20 20 20 20 20
As can be seen above, the experiment is made up of six infection time point classes that includes a healthy control class (H
) and five day infection time points (1-5
), each with 20 replicates.
For data pre-treatment prior to statistical analysis, a two-thirds maximum class occupancy filter can be applied. Features where the maximum proportion of non-missing data per class is above two-thirds are retained. A total ion count normalisation will also be applied.
-
-d <- d %>%
- occupancyMaximum(cls = 'day', occupancy = 2/3) %>%
- transformTICnorm()
-d
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1760
-#> Info: 9
This has reduced the data set to 1760 relevant features.
-The structure of the data can be visualised using both unsupervised and supervised methods. For instance, the first two principal components from a principal component analysis (PCA) of the data with the sample points coloured by infection class can be plotted using:
-
-plotPCA(d,cls = 'day',xAxis = 'PC1',yAxis = 'PC2')
And similarly, multidimensional scaling (MDS) of sample proximity values from a supervised random forest classification model along with receiver operator characteristic (ROC) curves.
-
-plotSupervisedRF(d,cls = 'day')
A progression can clearly be seen from the earliest to latest infected time points.
-For feature selection, one-way analysis of variance (ANOVA) can be performed for each feature to identify features significantly explanatory for the infection time point.
- -A table of the significantly explanatory features can be extracted with a Bonferroni correction adjusted p value < 0.05 using:
-
-explan_feat <- explanatoryFeatures(anova_results,threshold = 0.05)
-explan_feat
-#> # A tibble: 379 × 10
-#> Response Comparison Feature term df sumsq meansq statistic p.value
-#> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H N341 respon… 5 3.88e-4 7.76e-5 137. 1.55e-46
-#> 2 day 1~2~3~4~5~H N133 respon… 5 7.00e-5 1.40e-5 126. 8.63e-45
-#> 3 day 1~2~3~4~5~H N163 respon… 5 6.01e-5 1.20e-5 117. 2.95e-43
-#> 4 day 1~2~3~4~5~H N1087 respon… 5 2.42e-6 4.84e-7 99.8 5.61e-40
-#> 5 day 1~2~3~4~5~H N171 respon… 5 2.25e-7 4.50e-8 95.7 3.84e-39
-#> 6 day 1~2~3~4~5~H N513 respon… 5 3.38e-6 6.76e-7 95.3 4.78e-39
-#> 7 day 1~2~3~4~5~H N1025 respon… 5 2.78e-6 5.56e-7 91.0 3.91e-38
-#> 8 day 1~2~3~4~5~H N342 respon… 5 3.71e-6 7.41e-7 90.3 5.32e-38
-#> 9 day 1~2~3~4~5~H N1083 respon… 5 5.11e-5 1.02e-5 89.0 1.06e-37
-#> 10 day 1~2~3~4~5~H N1085 respon… 5 1.10e-5 2.19e-6 83.4 1.92e-36
-#> # … with 369 more rows, and 1 more variable: adjusted.p.value <dbl>
The ANOVA has identified 379 features significantly explanatory over the infection time course. A heat map of the mean relative intensity for each class of these explanatory features can be plotted to visualise their trends between the infection time point classes.
-
-plotExplanatoryHeatmap(anova_results,
- threshold = 0.05,
- featureNames = FALSE)
Many of the explanatory features can be seen to be most highly abundant in the final infection time point 5
.
Finally, box plots of the trends of individual features can be plotted, such as the N341
feature below.
-plotFeature(anova_results,feature = 'N341',cls = 'day')
--A tool kit for pre-treatment, modelling, feature selection and correlation analyses of metabolomics data.
-
This package provides a tool kit of methods for metabolomics analyses that includes:
-The metabolyseR
package can be installed from GitHub using the following:
-devtools::install_github('jasenfinch/metabolyseR',build_vignettes = TRUE)
The package documentation can be browsed online at https://jasenfinch.github.io/metabolyseR/.
-If this is your first time using metabolyseR
see the Introduction vignette or the quick start analysis below for information on how to get started.
If you believe you’ve found a bug in metabolyseR
, please file a bug (and, if possible, a reproducible example) at https://github.com/jasenfinch/metabolyseR/issues.
This example analysis will use the abr1
data set from the metaboData package. It is nominal mass flow-injection mass spectrometry (FI-MS) fingerprinting data from a plant-pathogen infection time course experiment. The analysis will also include use of the pipe %>%
from the magrittr package. First load the necessary packages.
For this example we will use only the negative acquisition mode data (abr1$neg
) and sample meta-information (abr1$fact
). Create an AnalysisData
class object using the following:
-d <- analysisData(abr1$neg,abr1$fact)
The data includes 120 samples and 2000 mass spectral features as shown below.
-
-d
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2000
-#> Info: 9
The clsAvailable()
function can be used to identify the columns available in our meta-information table.
-clsAvailable(d)
-#> [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name" "rep"
-#> [8] "day" "class"
For this analysis, we will be using the infection time course class information contained in the day
column. This can be extracted and the class frequencies tabulated using the following:
-d %>%
- clsExtract(cls = 'day') %>%
- table()
-#> .
-#> 1 2 3 4 5 H
-#> 20 20 20 20 20 20
As can be seen above, the experiment is made up of six infection time point classes that includes a healthy control class (H
) and five day infection time points (1-5
), each with 20 replicates.
For data pre-treatment prior to statistical analysis, a two-thirds maximum class occupancy filter can be applied. Features where the maximum proportion of non-missing data per class is above two-thirds are retained. A total ion count normalisation will also be applied.
-
-d <- d %>%
- occupancyMaximum(cls = 'day', occupancy = 2/3) %>%
- transformTICnorm()
-d
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 1760
-#> Info: 9
This has reduced the data set to 1760 relevant features.
-The structure of the data can be visualised using both unsupervised and supervised methods. For instance, the first two principal components from a principal component analysis (PCA) of the data with the sample points coloured by infection class can be plotted using:
-
-plotPCA(d,cls = 'day',xAxis = 'PC1',yAxis = 'PC2')
And similarly, multidimensional scaling (MDS) of sample proximity values from a supervised random forest classification model along with receiver operator characteristic (ROC) curves.
-
-plotSupervisedRF(d,cls = 'day')
A progression can clearly be seen from the earliest to latest infected time points.
-For feature selection, one-way analysis of variance (ANOVA) can be performed for each feature to identify features significantly explanatory for the infection time point.
- -A table of the significantly explanatory features can be extracted with a Bonferroni correction adjusted p value < 0.05 using:
-
-explan_feat <- explanatoryFeatures(anova_results,threshold = 0.05)
-explan_feat
-#> # A tibble: 379 × 10
-#> Response Comparison Feature term df sumsq meansq statistic p.value
-#> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H N341 respo… 5 3.88e-4 7.76e-5 137. 1.55e-46
-#> 2 day 1~2~3~4~5~H N133 respo… 5 7.00e-5 1.40e-5 126. 8.63e-45
-#> 3 day 1~2~3~4~5~H N163 respo… 5 6.01e-5 1.20e-5 117. 2.95e-43
-#> 4 day 1~2~3~4~5~H N1087 respo… 5 2.42e-6 4.84e-7 99.8 5.61e-40
-#> 5 day 1~2~3~4~5~H N171 respo… 5 2.25e-7 4.50e-8 95.7 3.84e-39
-#> 6 day 1~2~3~4~5~H N513 respo… 5 3.38e-6 6.76e-7 95.3 4.78e-39
-#> 7 day 1~2~3~4~5~H N1025 respo… 5 2.78e-6 5.56e-7 91.0 3.91e-38
-#> 8 day 1~2~3~4~5~H N342 respo… 5 3.71e-6 7.41e-7 90.3 5.32e-38
-#> 9 day 1~2~3~4~5~H N1083 respo… 5 5.11e-5 1.02e-5 89.0 1.06e-37
-#> 10 day 1~2~3~4~5~H N1085 respo… 5 1.10e-5 2.19e-6 83.4 1.92e-36
-#> # … with 369 more rows, and 1 more variable: adjusted.p.value <dbl>
The ANOVA has identified 379 features significantly explanatory over the infection time course. A heat map of the mean relative intensity for each class of these explanatory features can be plotted to visualise their trends between the infection time point classes.
-
-plotExplanatoryHeatmap(anova_results,
- threshold = 0.05,
- featureNames = FALSE)
Many of the explanatory features can be seen to be most highly abundant in the final infection time point 5
.
Finally, box plots of the trends of individual features can be plotted, such as the N341
feature below.
-plotFeature(anova_results,feature = 'N341',cls = 'day')
NEWS.md
- Suppressed name repair console message encountered during random forest permutation testing.
Added the proximity()
method for extracting sample proximities from the RandomForest
S4 class.
Added the mds()
method to perform multidimensional scaling on sample proximities from the RandomForest
S4 class.
Added the roc()
method to calculate receiver-operator characteristic curves from the RandomForest
S4 class.
An error is now thrown during random forest classification when fewer than two classes are specified.
plotSupervisedRF()
now skips plotting if errors are encountered during random forest training.
plotLDA()
.plotExplanatoryHeatmap()
method for the Analysis
class now returns the plot only if the number of plots is equal to 1.
Removed reference to the nCores
parameter from the documentation example of metabolyse()
.
imputeAll()
now suppressed. Temporarily added jasenfinch/missForest as a remote until stekhoven/missForest pull request #25 is resolved.
The limit of the number of plotted features in plotExplanatoryHeatmap
can now be specified using the featureLimit
argument.
plotExplanatoryHeatmap()
now returns NULL and returns a message when no explanatory features are found.
Fixed the alignment of the dendrogram branches with heat map rows in plotExplanatoryHeatmap()
.
Fixed ggplot2::guides()
warning in plotFeature()
and plotTIC()
.
Fixed bug in explanatoryFeatures()
methods for Analysis
class and lists where the threshold was not applied.
Corrected the text in the modelling vignette concerning the results of using unsupervised random forest for outlier detection.
Package version, creation date and verbose argument added to prototype of Analysis
class.
All generics are now defined as standard generics.
Added metrics
method for Analysis
class.
metrics
method for lists now ignores list elements that are not of class RandomForest
.
RSDthresh
argument default to 50% instead of 0.5% in QCrsdFilter
generic. Added a NEWS.md
file to track changes to the package.
pkgdown
site now available at https://jasenfinch.github.io/metabolyseR/.
Bug reports and issues URL at https://github.com/jasenfinch/metabolyseR/issues added to package DESCRIPTION.
Dedicated vignettes now available for a quick start example analysis, data pre-treatment and data modelling.
Function examples added to all documentation pages.
Unit test coverage increased to > 95%.
Parallel processing is now implemented using the future
package.
RandomForest
and Univariate
classes now inherit from the AnalysisData
class.
Improvements to plot theme aesthetics.
type
argument added to plotPCA()
, plotLDA()
, plotUnsupervisedRF()
and plotSupervisedRF()
methods for the Analysis
class.
"pre-treated"
for specifying type argument in Analysis
class methods now used over "preTreated"
Added clsRename()
method for renaming class information columns.
plotMeasures()
method renamed to plotMetrics()
.
Added plotMDS()
, plotImportance()
and plotMetrics()
methods for lists of RandomForest
class objects.
Added plotExplanatoryHeatmap()
method for lists of RandomForest
or Univariate
class objects.
Renamed keepVariables()
and removeVariables()
methods to keepFeatures()
and removeFeatures()
.
Added the helper functions preTreatmentElements()
, preTreatmentMethods()
and preTreatParameters()
for declaring pre-treatment parameters for the AnalysisParameters
class.
Added the helper functions modellingMethods()
and modellingParameters()
for declaring modelling parameters for the AnalysisParameters
class.
Added helper function correlationsParameters()
for declaring correlations parameters for the AnalysisParameters
class.
Added binaryComparisons()
method for retrieving all possible binary class comparisons from an AnalysisData
class object.
changeParameter()
now assigns parameter values through direct assignment.
Added analysisResults()
method for extracting analysis element results from the Analysis
class.
Added exportParameters()
method for exporting analysis parameters to YAML file format.
Added dat()
and sinfo()
accessor methods for the Analysis
class.
Relative standard deviation (RSD) values are now specified and returned as percentages.
An S4 class to store analysis results.
-log
list containing analysis dates and time
parameters
class AnalysisParameters containing the analysis parameters
raw
list containing info and raw data
pre-treated
list containing preTreated info and raw data
modelling
list containing modelling results
correlations
tibble containing weighted edgelist of correlations
An S4 class for metabolomic data and sample meta information.
-data
sample metabolomic data
info
sample meta information
An S4 class to store analysis parameters.
-pre-treatment
list containing parameters for data pre-treatment
modelling
list containing parameters for modelling
correlations
list containing parameters for correlations
Quality control (QC) sample pre-treatment methods.
-QCimpute(
- d,
- cls = "class",
- QCidx = "QC",
- occupancy = 2/3,
- parallel = "variables",
- seed = 1234
-)
-
-# S4 method for AnalysisData
-QCimpute(
- d,
- cls = "class",
- QCidx = "QC",
- occupancy = 2/3,
- parallel = "variables",
- seed = 1234
-)
-
-QCoccupancy(d, cls = "class", QCidx = "QC", occupancy = 2/3)
-
-# S4 method for AnalysisData
-QCoccupancy(d, cls = "class", QCidx = "QC", occupancy = 2/3)
-
-QCremove(d, cls = "class", QCidx = "QC")
-
-# S4 method for AnalysisData
-QCremove(d, cls = "class", QCidx = "QC")
-
-QCrsdFilter(d, cls = "class", QCidx = "QC", RSDthresh = 50)
-
-# S4 method for AnalysisData
-QCrsdFilter(d, cls = "class", QCidx = "QC", RSDthresh = 50)
S4 object of class AnalysisData
info column to use for class labels
QC sample label
occupancy threshold for filtering
parallel type to use. See ?missForest
for details
random number seed
RSD (%) threshold for filtering
An S4 object of class AnalysisData
containing QC treated data.
A QC sample is an average pooled sample, equally representative in composition of all the samples present within an experimental set. -Within an analytical run, the QC sample is analysed at equal intervals throughout the run. -If there is class structure within the run, this should be randomised within a block fashion so that the classes are equally represented in each block throughout the run. -A QC sample can then be injected and analysed between these randomised blocks. -This provides a set of technical injections that allows the variability in instrument performance over the run to be accounted for and the robustness of the acquired variables to be assessed.
-The technical reproducibility of an acquired variable can be assessed using its relative standard deviation (RSD) within the QC samples. -The variable RSDs can then be filtered below a threshold value to remove metabolome features that are poorly reproducible across the analytical runs. -This variable filtering strategy has an advantage over that of occupancy alone as it is not dependent on underlying class structure. -Therefore, the variables and variable numbers will not alter if a new class structure is imposed upon the data.
-QCimpute
: Missing value imputation of QC samples.
QCoccupancy
: Feature maximum occupancy filtering based on QC samples.
QCremove
: Remove QC samples.
QCrsdFilter
: Feature filtering based on the RSD of QC sample features.
-## Initial example data preparation
-library(metaboData)
-d <- analysisData(abr1$neg[,1:1000],abr1$fact)
-
-## Plot the feature RSD distributions of the H class only
-d %>%
- keepClasses(cls = 'day',classes = 'H') %>%
- plotRSD(cls = 'day')
-#> Warning: Removed 119 rows containing non-finite values (stat_density).
-#> Warning: Removed 1 row(s) containing missing values (geom_path).
-
-
-## Apply QC feature occupancy filtering and QC feature RSD filtering
-QC_treated <- d %>%
- QCoccupancy(cls = 'day',QCidx = 'H',occupancy = 2/3) %>%
- QCrsdFilter(cls = 'day',QCidx = 'H',RSDthresh = 50)
-
-print(QC_treated)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 404
-#> Info: 9
-#>
-
-## Plot the feature RSD distributions of the H class after QC treatments
-QC_treated %>%
- keepClasses(cls = 'day',classes = 'H') %>%
- plotRSD(cls = 'day')
-
-
QC imputation.
-QCimpute( - d, - cls = "class", - QCidx = "QC", - occupancy = 2/3, - parallel = "variables", - seed = 1234 -) - -# S4 method for AnalysisData -QCimpute( - d, - cls = "class", - QCidx = "QC", - occupancy = 2/3, - parallel = "variables", - seed = 1234 -)- -
d | -S4 object of class AnalysisData |
-
---|---|
cls | -info column to use for class labels |
-
QCidx | -QC sample label |
-
occupancy | -occupancy threshold for imputation |
-
parallel | -parallel type to use. See |
-
seed | -random number seed |
-
QC maximum occupancy filter.
-QCoccupancy(d, cls = "class", QCidx = "QC", occupancy = 2/3) - -# S4 method for AnalysisData -QCoccupancy(d, cls = "class", QCidx = "QC", occupancy = 2/3)- -
d | -S4 object of class AnalysisData |
-
---|---|
cls | -info column to use for class labels |
-
QCidx | -QC sample label |
-
occupancy | -occupancy threshold for filtering |
-
QC relative standard deviation (RSD) filtering.
-QCrsdFilter(d, cls = "class", QCidx = "QC", RSDthresh = 0.5) - -# S4 method for AnalysisData -QCrsdFilter(d, cls = "class", QCidx = "QC", RSDthresh = 50)- -
d | -S4 object of class AnalysisData |
-
---|---|
cls | -info column to use for class labels |
-
QCidx | -QC sample label |
-
RSDthresh | -RSD (%) threshold for filtering |
-
An S4 class for random forest results and models.
-type
random forest type
response
response variable name
results
list of measure and importance results tables
predictions
tibble of model observation predictions
permutations
list of permutations measure and importance results tables
importances
tibble of model feature importances
proximities
tibble of model observation proximities
models
list of random forest models
An S4 class for univariate test models and results.
-type
univariate test type
models
list of model objects
results
tibble containing test results
Aggregation of sample features based on a grouping variable.
-aggregateMean(d, cls = "class")
-
-# S4 method for AnalysisData
-aggregateMean(d, cls = "class")
-
-aggregateMedian(d, cls = "class")
-
-# S4 method for AnalysisData
-aggregateMedian(d, cls = "class")
-
-aggregateSum(d, cls = "class")
-
-# S4 method for AnalysisData
-aggregateSum(d, cls = "class")
S4 object of class AnalysisData
info column to use for class data
An S4 object of class AnalysisData
containing the aggregated data.
Sample aggregation allows the electronic pooling of sample features based on a grouping variable. -This is useful in situations such as the presence of technical replicates that can be aggregated to reduce the effects of pseudo replication.
-aggregateMean
: Aggregate sample features to the group mean.
aggregateMedian
: Aggregate sample features to the group median.
aggregateSum
: Aggregate sample features to the group total.
## Each of the following examples shows the application of the aggregation method and then
-## a Principal Component Analysis is plotted to show its effect on the data structure.
-
-## Initial example data preparation
-library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- occupancyMaximum(occupancy = 2/3)
-
-d %>%
- plotPCA(cls = 'day')
-
-
-## Mean aggregation
-d %>%
- aggregateMean(cls = 'day') %>%
- plotPCA(cls = 'day',ellipses = FALSE)
-
-
-## Median aggregation
-d %>%
- aggregateMedian(cls = 'day') %>%
- plotPCA(cls = 'day',ellipses = FALSE)
-
-
-## Sum aggregation
-d %>%
- aggregateSum(cls = 'day') %>%
- plotPCA(cls = 'day',ellipses = FALSE)
-
-
AnalysisData
and Analysis
class accessorsR/analysis-accessors.R
- analysis-accessors.Rd
Accessor methods for the AnalysisData
and Analysis
S4 classes.
dat(x, ...)
-
-# S4 method for AnalysisData
-dat(x)
-
-# S4 method for Analysis
-dat(x, type = c("raw", "pre-treated"))
-
-dat(x, ...) <- value
-
-# S4 method for AnalysisData
-dat(x) <- value
-
-# S4 method for Analysis
-dat(x, type = c("raw", "pre-treated")) <- value
-
-sinfo(x, ...)
-
-# S4 method for AnalysisData
-sinfo(x)
-
-# S4 method for Analysis
-sinfo(x, type = c("raw", "pre-treated"), value)
-
-sinfo(x, ...) <- value
-
-# S4 method for AnalysisData
-sinfo(x) <- value
-
-# S4 method for Analysis
-sinfo(x, type = c("raw", "pre-treated")) <- value
-
-raw(x)
-
-# S4 method for Analysis
-raw(x)
-
-raw(x) <- value
-
-# S4 method for Analysis
-raw(x) <- value
-
-preTreated(x)
-
-# S4 method for Analysis
-preTreated(x)
-
-preTreated(x) <- value
-
-# S4 method for Analysis
-preTreated(x) <- value
-
-features(x, ...)
-
-# S4 method for AnalysisData
-features(x)
-
-# S4 method for Analysis
-features(x, type = c("raw", "pre-treated"))
-
-nSamples(x, ...)
-
-# S4 method for AnalysisData
-nSamples(x)
-
-# S4 method for Analysis
-nSamples(x, type = c("raw", "pre-treated"))
-
-nFeatures(x, ...)
-
-# S4 method for AnalysisData
-nFeatures(x)
-
-# S4 method for Analysis
-nFeatures(x, type = c("raw", "pre-treated"))
-
-analysisResults(x, element)
-
-# S4 method for Analysis
-analysisResults(x, element)
S4 object of class AnalysisData
or Analysis
arguments to pass to the appropriate method
get or set raw
or pre-treated
data
value to set
analysis element results to return
dat
: Return a metabolomic data table.
dat<-
: Set a metabolomic data table.
sinfo
: Return a sample information data table.
sinfo<-
: Set a sample information data table.
raw
: Return the AnalysisData
object containing unprocessed metabolomic data from an Analysis
object.
raw<-
: Set an AnalysisData
object to the raw
slot of an Analysis
class object.
preTreated
: Return the AnalysisData
object containing pre-treated metabolomic data from an Analysis
object.
preTreated<-
: Set an AnalysisData
object to the pre-treated
slot of an Analysis
class object.
features
: Return the feature names.
nSamples
: Return the number of samples.
nFeatures
: Return the number of features.
analysisResults
: Return results from an Analysis
object of an analysis element.
library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-## Return the metabolomic data
-dat(d)
-#> # A tibble: 120 × 101
-#> N200 N201 N202 N203 N204 N205 N206 N207 N208 N209 N210 N211
-#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 0 0 0 2.98 0 0 0 0.468 0 1.43 0 0.170
-#> 2 0 0 0 1.30 0 1.15 0 0 0 0.492 0 0
-#> 3 0 0 0 6.08 0.214 2.53 0 1.85 0 1.06 0.184 0.0827
-#> 4 0 4.24 0 1.48 0 0 0.147 0 0 0.929 0 0.286
-#> 5 0 0 0 0.530 0 0.233 0.376 1.41 0 0.274 0 0.139
-#> 6 0 0 0 0 0 0.438 0 0 0.219 0.325 0 0
-#> 7 0 0 0 0.547 0 0 0 0 0 0 0 0
-#> 8 0 0 0.195 1.37 0.594 1.11 0.0902 0 0 0 0.162 0
-#> 9 0 0 0 1.24 0 0.196 0.675 0.528 0.128 2.61 0.294 2.66
-#> 10 0 0 0 0.113 0 1.06 0 0 0 1.76 2.96 0
-#> # … with 110 more rows, and 89 more variables: N212 <dbl>, N213 <dbl>,
-#> # N214 <dbl>, N215 <dbl>, N216 <dbl>, N217 <dbl>, N218 <dbl>, N219 <dbl>,
-#> # N220 <dbl>, N221 <dbl>, N222 <dbl>, N223 <dbl>, N224 <dbl>, N225 <dbl>,
-#> # N226 <dbl>, N227 <dbl>, N228 <dbl>, N229 <dbl>, N230 <dbl>, N231 <dbl>,
-#> # N232 <dbl>, N233 <dbl>, N234 <dbl>, N235 <dbl>, N236 <dbl>, N237 <dbl>,
-#> # N238 <dbl>, N239 <dbl>, N240 <dbl>, N241 <dbl>, N242 <dbl>, N243 <dbl>,
-#> # N244 <dbl>, N245 <dbl>, N246 <dbl>, N247 <dbl>, N248 <dbl>, N249 <dbl>, …
-
-## Set the metabolomic data
-dat(d) <- abr1$neg[,300:400]
-
-## Return the sample information
-sinfo(d)
-#> # A tibble: 120 × 9
-#> injorder pathcdf filecdf name.org remark name rep day class
-#> <int> <fct> <fct> <fct> <fct> <fct> <int> <fct> <int>
-#> 1 1 C:/Xcalibur/ANDI-LT… 01.cdf 12_2 ok 12_2 2 2 2
-#> 2 2 C:/Xcalibur/ANDI-LT… 02.cdf 13_3 ok 13_4 3 3 3
-#> 3 3 C:/Xcalibur/ANDI-LT… 03.cdf 15_4 ok 15_5 5 4 4
-#> 4 4 C:/Xcalibur/ANDI-LT… 04.cdf 12_1 ok 12_2 2 1 1
-#> 5 5 C:/Xcalibur/ANDI-LT… 05.cdf 12_2 ok 12_2 2 2 2
-#> 6 6 C:/Xcalibur/ANDI-LT… 06.cdf 11_1 ok 11_2 1 1 1
-#> 7 7 C:/Xcalibur/ANDI-LT… 07.cdf 14_2 ok 14_3 4 2 2
-#> 8 8 C:/Xcalibur/ANDI-LT… 08.cdf 11_4 ok 11_5 1 4 4
-#> 9 9 C:/Xcalibur/ANDI-LT… 09.cdf 13_H ok 13_H 3 H 6
-#> 10 10 C:/Xcalibur/ANDI-LT… 10.cdf 15_H ok 15_H 5 H 6
-#> # … with 110 more rows
-
-## Set the sample information
-sinfo(d) <- abr1$fact
-
-## Return the feature names
-features(d)
-#> [1] "N300" "N301" "N302" "N303" "N304" "N305" "N306" "N307" "N308" "N309"
-#> [11] "N310" "N311" "N312" "N313" "N314" "N315" "N316" "N317" "N318" "N319"
-#> [21] "N320" "N321" "N322" "N323" "N324" "N325" "N326" "N327" "N328" "N329"
-#> [31] "N330" "N331" "N332" "N333" "N334" "N335" "N336" "N337" "N338" "N339"
-#> [41] "N340" "N341" "N342" "N343" "N344" "N345" "N346" "N347" "N348" "N349"
-#> [51] "N350" "N351" "N352" "N353" "N354" "N355" "N356" "N357" "N358" "N359"
-#> [61] "N360" "N361" "N362" "N363" "N364" "N365" "N366" "N367" "N368" "N369"
-#> [71] "N370" "N371" "N372" "N373" "N374" "N375" "N376" "N377" "N378" "N379"
-#> [81] "N380" "N381" "N382" "N383" "N384" "N385" "N386" "N387" "N388" "N389"
-#> [91] "N390" "N391" "N392" "N393" "N394" "N395" "N396" "N397" "N398" "N399"
-#> [101] "N400"
-
-## Return the number of samples
-nSamples(d)
-#> [1] 120
-
-## Return the number of features
-nFeatures(d)
-#> [1] 101
-
Create an AnalysisData S4 object.
-analysisData(data, info)
table containing sample metabolomic data
table containing sample meta information
An S4 object of class AnalysisData.
-library(metaboData)
-d <- analysisData(data = abr1$neg,info = abr1$fact)
-
-print(d)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2000
-#> Info: 9
-#>
-
Return the analysis elements available in metabolyseR
.
analysisElements()
A character vector of analysis elements.
-analysisElements()
-#> [1] "pre-treatment" "modelling" "correlations"
-
Initiate an AnalysisParameters
object with the default analysis parameters for each of the analysis elements.
analysisParameters(elements = analysisElements())
character vector containing elements for analysis.
An S4 object of class AnalysisParameters
containing the default analysis parameters.
p <- analysisParameters()
-
-print(p)
-#> Parameters:
-#> pre-treatment
-#> QC
-#> occupancyFilter
-#> cls = class
-#> QCidx = QC
-#> occupancy = 2/3
-#> impute
-#> cls = class
-#> QCidx = QC
-#> occupancy = 2/3
-#> parallel = variables
-#> seed = 1234
-#> RSDfilter
-#> cls = class
-#> QCidx = QC
-#> RSDthresh = 50
-#> removeQC
-#> cls = class
-#> QCidx = QC
-#> occupancyFilter
-#> maximum
-#> cls = class
-#> occupancy = 2/3
-#> impute
-#> class
-#> cls = class
-#> occupancy = 2/3
-#> seed = 1234
-#> transform
-#> TICnorm
-#>
-#> modelling
-#> randomForest
-#> cls = class
-#> rf = list()
-#> reps = 1
-#> binary = FALSE
-#> comparisons = list()
-#> perm = 0
-#> returnModels = FALSE
-#> seed = 1234
-#>
-#> correlations
-#> method = pearson
-#> pAdjustMethod = bonferroni
-#> corPvalue = 0.05
-
Extract analysis results for a given analysis element.
-analysisResults(x, element) - -# S4 method for Analysis -analysisResults(x, element)- -
x | -S4 object of class Analysis |
-
---|---|
element | -Analysis element to extract.
-Should be one of those returned |
-
One-way analysis of variance (ANOVA).
-S4 object of class AnalysisData
a vector of sample info column names to analyse
p value adjustment method
list of comparisons to perform
should models be returned
library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-## Perform ANOVA
-anova_analysis <- anova(d,cls = 'day')
-
-## Extract significant features
-explanatoryFeatures(anova_analysis)
-#> # A tibble: 21 × 10
-#> Response Comparison Feature term df sumsq meansq statistic p.value
-#> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H N277 response 5 63072. 12614. 39.1 3.14e-23
-#> 2 day 1~2~3~4~5~H N229 response 5 43549. 8710. 18.1 3.54e-13
-#> 3 day 1~2~3~4~5~H N299 response 5 1211. 242. 16.4 3.87e-12
-#> 4 day 1~2~3~4~5~H N295 response 5 271. 54.2 13.6 2.02e-10
-#> 5 day 1~2~3~4~5~H N281 response 5 192. 38.5 12.5 1.16e- 9
-#> 6 day 1~2~3~4~5~H N245 response 5 6268. 1254. 11.6 4.38e- 9
-#> 7 day 1~2~3~4~5~H N255 response 5 5363. 1073. 11.0 1.14e- 8
-#> 8 day 1~2~3~4~5~H N278 response 5 277. 55.4 10.9 1.48e- 8
-#> 9 day 1~2~3~4~5~H N259 response 5 1236. 247. 10.8 1.72e- 8
-#> 10 day 1~2~3~4~5~H N279 response 5 810. 162. 10.5 2.77e- 8
-#> # … with 11 more rows, and 1 more variable: adjusted.p.value <dbl>
-
Return a vector of possible binary comparisons for a -given sample information column.
-binaryComparisons(x, cls = "class") - -# S4 method for AnalysisData -binaryComparisons(x, cls = "class")- -
x | -S4 object of class AnalysisData. |
-
---|---|
cls | -sample information column to use |
-
Bind the rows of AnalysisData objects contained within a list.
-bindRows(d)
-
-# S4 method for list
-bindRows(d)
list object containing S4 objects of class AnalysisData to be bound
An S4 object of class AnalysisData containing the bound data sets.
-library(metaboData)
-d <- list(
- negative = analysisData(abr1$neg,abr1$fact),
- positive = analysisData(abr1$pos,abr1$fact)
- )
-
-bindRows(d)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 240
-#> Features: 4000
-#> Info: 9
-#>
-
Change analysis parameters.
-changeParameter(x, parameterName, elements = analysisElements()) <- value
-
-# S4 method for AnalysisParameters
-changeParameter(x, parameterName, elements = analysisElements()) <- value
S4 object of class AnalysisParameters
name of the parameter to change
character vector of analysis elements to target parameter
-change. Can be any returned by analysisElements()
.
New value of the parameter
An S4 object of class AnalysisParameters
.
For the parameter name selected, all parameters with that name will -be altered.
-p <- analysisParameters('pre-treatment')
-
-changeParameter(p,'cls') <- 'day'
-
-print(p)
-#> Parameters:
-#> pre-treatment
-#> QC
-#> occupancyFilter
-#> cls = day
-#> QCidx = QC
-#> occupancy = 2/3
-#> impute
-#> cls = day
-#> QCidx = QC
-#> occupancy = 2/3
-#> parallel = variables
-#> seed = 1234
-#> RSDfilter
-#> cls = day
-#> QCidx = QC
-#> RSDthresh = 50
-#> removeQC
-#> cls = day
-#> QCidx = QC
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> impute
-#> class
-#> cls = day
-#> occupancy = 2/3
-#> seed = 1234
-#> transform
-#> TICnorm
-#>
-
Query or alter sample meta information in AnalysisData
or Analysis
class objects.
Replace a given sample info column from an Analysis or -AnalysisData object.
-clsAdd(d, cls, value, ...)
-
-# S4 method for AnalysisData
-clsAdd(d, cls, value)
-
-# S4 method for Analysis
-clsAdd(d, cls, value, type = c("raw", "pre-treated"))
-
-clsArrange(d, cls = "class", descending = FALSE, ...)
-
-# S4 method for AnalysisData
-clsArrange(d, cls = "class", descending = FALSE)
-
-# S4 method for Analysis
-clsArrange(
- d,
- cls = "class",
- descending = FALSE,
- type = c("raw", "pre-treated")
-)
-
-clsAvailable(d, ...)
-
-# S4 method for AnalysisData
-clsAvailable(d)
-
-# S4 method for Analysis
-clsAvailable(d, type = c("raw", "pre-treated"))
-
-clsExtract(d, cls = "class", ...)
-
-# S4 method for AnalysisData
-clsExtract(d, cls = "class")
-
-# S4 method for Analysis
-clsExtract(d, cls = "class", type = c("raw", "pre-treated"))
-
-clsRemove(d, cls, ...)
-
-# S4 method for AnalysisData
-clsRemove(d, cls)
-
-# S4 method for Analysis
-clsRemove(d, cls, type = c("raw", "pre-treated"))
-
-clsRename(d, cls, newName, ...)
-
-# S4 method for AnalysisData
-clsRename(d, cls, newName)
-
-# S4 method for Analysis
-clsRename(d, cls, newName, type = c("raw", "pre-treated"))
-
-clsReplace(d, value, cls = "class", ...)
-
-# S4 method for AnalysisData
-clsReplace(d, value, cls = "class")
-
-# S4 method for Analysis
-clsReplace(d, value, cls = "class", type = c("raw", "pre-treated"))
S4 object of class Analysis or AnalysisData
sample info column to extract
vector of new sample information for replacement
arguments to pass to specific method
raw
or pre-treated
sample information
TRUE/FALSE, arrange samples in descending order
new column name
clsAdd
: Add a sample information column.
clsArrange
: Arrange sample row order by a specified sample information column.
clsAvailable
: Retrieve the names of the available sample information columns.
clsExtract
: Extract the values of a specified sample information column.
clsRemove
: Remove a sample information column.
clsRename
: Rename a sample information column.
clsReplace
: Replace a sample information column.
library(metaboData)
-d <- analysisData(abr1$neg,abr1$fact)
-
-## Add a sample information column named 'new'
-d <- clsAdd(d,'new',1:nSamples(d))
-
-print(d)
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2000
-#> Info: 10
-#>
-
-## Arrange the row orders by the 'day' column
-d <- clsArrange(d,'day')
-
-clsExtract(d,'day')
-#> [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
-#> [38] 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4
-#> [75] 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 H H H H H H H H H H H
-#> [112] H H H H H H H H H
-#> Levels: 1 2 3 4 5 H
-
-## Retrieve the available sample information column names
-clsAvailable(d)
-#> [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name"
-#> [7] "rep" "day" "class" "new"
-
-## Extract the values of the 'day' column
-clsExtract(d,'day')
-#> [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
-#> [38] 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4
-#> [75] 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 H H H H H H H H H H H
-#> [112] H H H H H H H H H
-#> Levels: 1 2 3 4 5 H
-
-## Remove the 'class' column
-d <- clsRemove(d,'class')
-
-clsAvailable(d)
-#> [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name" "rep"
-#> [8] "day" "new"
-
-## Rename the 'day' column to 'treatment'
-d <- clsRename(d,'day','treatment')
-
-clsAvailable(d)
-#> [1] "injorder" "pathcdf" "filecdf" "name.org" "remark" "name"
-#> [7] "rep" "treatment" "new"
-
-## Replace the values of the 'treatment' column
-d <- clsReplace(d,rep(1,nSamples(d)),'treatment')
-
-clsExtract(d,'treatment')
-#> [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-#> [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-#> [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-#> [112] 1 1 1 1 1 1 1 1 1
-
Add a sample info column to an Analysis or -AnalysisData object.
-clsAdd(x, cls, value, ...) - -# S4 method for AnalysisData -clsAdd(x, cls, value) - -# S4 method for Analysis -clsAdd(x, cls, value, type = "raw")- -
x | -S4 object of class Analysis or AnalysisData |
-
---|---|
cls | -name of new sample information column |
-
value | -new sample information to add |
-
... | -arguments to pass to specific method |
-
type | -
|
-
Order samples within an object of class AnalysisData or -Analysis by a given sample information column.
-clsArrange(x, cls = "class", descending = FALSE, ...) - -# S4 method for AnalysisData -clsArrange(x, cls = "class", descending = FALSE) - -# S4 method for Analysis -clsArrange(x, cls = "class", descending = FALSE, type = "raw")- -
x | -S4 object of class AnalysisData or Analysis |
-
---|---|
cls | -name of sample information column to arrange by |
-
descending | -TRUE/FALSE, arrange samples in descending order |
-
... | -arguments to pass to specific method |
-
type | -
|
-
Return available sample info columns from an Analysis or -AnalysisData object.
-clsAvailable(x, ...) - -# S4 method for AnalysisData -clsAvailable(x) - -# S4 method for Analysis -clsAvailable(x, type = "raw")- -
x | -S4 object of class Analysis or AnalysisData |
-
---|---|
... | -arguments to pass to specific method |
-
type | -
|
-
Extract a given sample info column from an Analysis or -AnalysisData object.
-clsExtract(x, cls = "class", ...) - -# S4 method for AnalysisData -clsExtract(x, cls = "class") - -# S4 method for Analysis -clsExtract(x, cls = "class", type = "raw")- -
x | -S4 object of class Analysis or AnalysisData |
-
---|---|
cls | -sample info column to extract |
-
... | -arguments to pass to specific method |
-
type | -
|
-
Remove a sample info column from an Analysis or -AnalysisData object.
-clsRemove(x, cls, ...) - -# S4 method for AnalysisData -clsRemove(x, cls) - -# S4 method for Analysis -clsRemove(x, cls, type = "raw")- -
x | -S4 object of class Analysis or AnalysisData |
-
---|---|
cls | -name of sample information column to remove |
-
... | -arguments to pass to specific method |
-
type | -
|
-
Rename a sample information column within an object of -AnalysisData or Analysis.
-clsRename(x, cls, newName, ...) - -# S4 method for AnalysisData -clsRename(x, cls, newName) - -# S4 method for Analysis -clsRename(x, cls, newName, type = "raw")- -
x | -S4 object of class Analysis or AnalysisData |
-
---|---|
cls | -sample information column to rename |
-
newName | -new column name |
-
... | -arguments to pass to specific method |
-
type | -
|
-
Replace a given sample info column from an Analysis or -AnalysisData object.
-clsReplace(x, value, cls = "class", ...) - -# S4 method for AnalysisData -clsReplace(x, value, cls = "class") - -# S4 method for Analysis -clsReplace(x, value, cls = "class", type = "raw")- -
x | -S4 object of class Analysis or AnalysisData |
-
---|---|
value | -vector of new sample information for replacement |
-
cls | -sample info column to replace |
-
... | -arguments to pass to specific method |
-
type | -
|
-
Correction of batch/block differences.
-S4 object of class AnalysisData
sample information column name to use containing sample block -groupings
type of average to use
An S4 object of class AnalysisData
containing the corrected data.
There can sometimes be artificial batch related variability introduced into metabolomics analyses as a result of analytical instrumentation or sample preparation. -With an appropriate randomised block design of sample injection order, batch related variability can be corrected using an average centring correction method of the individual features.
-correctionCenter
: Correction using group average centring.
-## Initial example data preparation
-library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- occupancyMaximum(occupancy = 2/3)
-
-## Group total ion count distributions prior to correction
-d %>%
- plotTIC(by = 'day',colour = 'day')
-
-
-## Group total ion count distributions after group median correction
-d %>%
- correctionCenter(block = 'day',type = 'median') %>%
- plotTIC(by = 'day',colour = 'day')
-
-
Batch correction using average centering.
-# S4 method for AnalysisData -correctionCenter(d, block = "block", type = c("mean", "median"))- - - -
Feature correlation analysis.
-correlations(d, ...)
-
-# S4 method for AnalysisData
-correlations(
- d,
- method = "pearson",
- pAdjustMethod = "bonferroni",
- corPvalue = 0.05
-)
-
-# S4 method for Analysis
-correlations(d)
S4 object of class AnalysisData
arguments to pass to specific method
correlation method. One of pearson
or spearman
.
p-value adjustment method. See ?p.adjust
for available methods.
p-value cut-off threshold for significance
A tibble containing results of significantly correlated features.
-Correlation analyses can be used to identify associated features within data sets. -This can be useful for identifying clusters of related features that can be used to annotate metabolites within data sets. -All features are compared and the returned table of correlations is p-value thresholded using the specified cut-off.
-library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-correlations(d)
-#> # A tibble: 130 × 7
-#> Feature1 Feature2 log2IntensityRatio r `|r|` p n
-#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
-#> 1 N212 N227 -0.884 0.980 0.980 0.0107 7
-#> 2 N224 N286 1.85 0.971 0.971 0.00612 8
-#> 3 N215 N276 0.227 0.965 0.965 0.0419 7
-#> 4 N224 N265 0.576 0.943 0.943 0.00138 11
-#> 5 N201 N275 -1.59 0.909 0.909 0.0264 10
-#> 6 N213 N231 -1.63 0.883 0.883 0 108
-#> 7 N224 N225 -0.792 0.863 0.863 0.000000176 29
-#> 8 N258 N263 -2.89 0.857 0.857 0.0181 13
-#> 9 N267 N297 -0.671 0.853 0.853 0 120
-#> 10 N211 N291 -1.55 0.831 0.831 0.00106 19
-#> # … with 120 more rows
-
Retrieve the default parameters for correlation analysis.
-correlationsParameters()
## Retrieve the default correlation parameters
-p <- correlationsParameters()
-
-## Assign the correlation parameters to analysis parameters
-cp <- analysisParameters('correlations')
-parameters(cp,'correlations') <- p
-
-print(cp)
-#> Parameters:
-#> correlations
-#> method = pearson
-#> pAdjustMethod = bonferroni
-#> corPvalue = 0.05
-
Return or set sample data in -AnalysisData or Analysis objects.
-dat(x, ...) - -dat(x, ...) <- value - -# S4 method for AnalysisData -dat(x) - -# S4 method for AnalysisData -dat(x) <- value - -# S4 method for Analysis -dat(x, type = "pre-treated") - -# S4 method for Analysis -dat(x, type = "pre-treated") <- value- -
x | -S4 object of class AnalysisData or Analysis |
-
---|---|
... | -arguments to pass to the appropriate method |
-
value | -tibble containing sample data |
-
type | -data type to extract or set. -Should be one of "raw" or "pre-treated" |
-
Extract explanatory features from modelling results.
-explanatoryFeatures(x, ...) - -# S4 method for Univariate -explanatoryFeatures(x, threshold = 0.05, ...) - -# S4 method for RandomForest -explanatoryFeatures(x, metric = "FalsePositiveRate", threshold = 0.05) - -# S4 method for list -explanatoryFeatures(x, threshold = 0.05, ...) - -# S4 method for Analysis -explanatoryFeatures(x, threshold = 0.05, ...)- -
x | -S4 object of class RandomForest or Univariate |
-
---|---|
... | -arguments to pass to method for specific class |
-
threshold | -threshold below which explanatory features are extracted |
-
metric | -importance metric on which to retrieve explanatory features |
-
Export analysis parameters from AnalysisParameters or -Analysis objects to YAML format.
-exportParameters(x, file = "analysis_parameters.yaml") - -# S4 method for AnalysisParameters -exportParameters(x, file = "analysis_parameters.yaml") - -# S4 method for Analysis -exportParameters(x, file = "analysis_parameters.yaml")- -
x | -S4 object of class AnalysisParameters or Analysis |
-
---|---|
file | -File name and path to export to |
-
Return a vector of the feature names.
-features(x, ...) - -# S4 method for AnalysisData -features(x) - -# S4 method for Analysis -features(x, type = "raw")- -
x | -S4 object of class AnalysisData or Analysis |
-
---|---|
... | -arguments to pass to the appropriate method |
-
type | -return features from "raw" or "pre-treated" data |
-
Return feature importance results from RandomForest -or Univariate class objects.
-importance(x) - -# S4 method for RandomForest -importance(x) - -# S4 method for Univariate -importance(x) - -# S4 method for list -importance(x) - -# S4 method for Analysis -importance(x)- -
x | -S4 object of class RandomForest or Univariate |
-
---|
Impute missing values using random forest imputation.
-imputeAll(d, occupancy = 2/3, parallel = "variables", seed = 1234)
-
-# S4 method for AnalysisData
-imputeAll(d, occupancy = 2/3, parallel = "variables", seed = 1234)
-
-imputeClass(d, cls = "class", occupancy = 2/3, seed = 1234)
-
-# S4 method for AnalysisData
-imputeClass(d, cls = "class", occupancy = 2/3, seed = 1234)
S4 object of class AnalysisData
occupancy threshold above which missing values of a feature will be imputed
parallel type to use. See ?missForest
for details
random number seed
info column to use for class labels
An S4 object of class AnalysisData
containing the data after imputation.
Missing values can have an important influence on downstream analyses with zero values heavily influencing the outcomes of parametric tests.
-Where and how they are imputed are important considerations and are highly related to variable occupancy.
-The methods provided here allow both these aspects to be taken into account and utilise random forest imputation using the missForest
package.
imputeAll
: Impute missing values across all sample features.
imputeClass
: Impute missing values class-wise.
## Each of the following examples shows the application of each imputation method and then
-## a Linear Discriminant Analysis is plotted to show its effect on the data structure.
-
-## Initial example data preparation
-library(metaboData)
-
-d <- analysisData(abr1$neg[,200:250],abr1$fact) %>%
- occupancyMaximum(occupancy = 2/3)
-
-d %>%
- plotLDA(cls = 'day')
-
-
-## Missing value imputation across all samples
-d %>%
- imputeAll(parallel = 'no') %>%
- plotLDA(cls = 'day')
-
-
-## Missing value imputation class-wise
-d %>%
- imputeClass(cls = 'day') %>%
- plotLDA(cls = 'day')
-
-
Impute missing values across all samples using Random Forest.
-imputeAll(d, occupancy = 2/3, parallel = "variables", seed = 1234) - -# S4 method for AnalysisData -imputeAll(d, occupancy = 2/3, parallel = "variables", seed = 1234)- -
d | -S4 object of class AnalysisData |
-
---|---|
occupancy | -occupancy threshold for imputation of a given feature |
-
parallel | -parallel type to use. See |
-
seed | -random number seed |
-
Impute missing values class-wise using Random Forest.
-imputeClass(d, cls = "class", occupancy = 2/3, seed = 1234) - -# S4 method for AnalysisData -imputeClass(d, cls = "class", occupancy = 2/3, seed = 1234)- -
d | -S4 object of class AnalysisData |
-
---|---|
cls | -info column to use for class labels |
-
occupancy | -occupancy threshold for imputation |
-
seed | -random number seed |
-
- Analysis S4 classes- - |
- |
---|---|
- - | -Analysis S4 class |
-
- - | -AnalysisData S4 class |
-
- - | -Perform an analysis |
-
- - | -AnalysisData class constructor |
-
-
|
-
|
-
-
|
- Sample meta information wrangling |
-
- Analysis parameters- - |
- |
- - | -AnalysisParameters S4 class |
-
- - | -Analysis elements |
-
- - | -Create an |
-
- - | -Get or set analysis parameters |
-
- - | -Change analysis parameters |
-
- - | -Parse/export analysis parameters |
-
-
|
- Pre-treatment parameters |
-
- - | -Modelling parameters |
-
- - | -Correlations parameters |
-
- Pre-treatment- - |
- |
- - | -Sample aggregation |
-
- - | -Batch/block correction |
-
- - | -Missing data imputation |
-
- - | -Keep samples, classes or features |
-
- - | -Feature occupancy filtering |
-
- - | -Quality control (QC) sample treatments |
-
- - | -Remove samples, classes or features |
-
-
|
- Scaling, transformation and normalisation methods |
-
- Modelling- - |
- |
- - | -RandomForest S4 class |
-
- - | -Univariate S4 class |
-
- - | -Random forest analysis |
-
- - | -ANOVA |
-
- - | -Welch's t-test |
-
- - | -Linear regression |
-
-
|
- Modelling accessor methods |
-
- - | -Multidimensional scaling (MDS) |
-
- - | -Receiver-operator characteristic (ROC) curves |
-
- Correlations- - |
- |
- - | -Feature correlation analysis |
-
- Plotting- - |
- |
- - | -Plot a feature |
-
- - | -Plot class occupancy distributions |
-
- - | -Plot RSD distributions |
-
- - | -Plot sample total ion counts |
-
- - | -Principal Component Analysis plot |
-
- - | -Principal Component - Linear Discriminant Analysis plot |
-
- - | -Unsupervised random forest MDS plot |
-
- - | -Supervised random forest MDS plot |
-
- - | -Multidimensional scaling (MDS) plot |
-
- - | -Plot receiver operator characteristic (ROC) curves |
-
- - | -Plot model performance metrics |
-
- - | -Plot feature importance |
-
- - | -Heatmap plot of explanatory features |
-
- Miscellaneous- - |
- |
- - | -Bind |
-
- - | -Split an |
-
- - | -Calculate feature relative standard deviations |
-
- - | -Calculate feature class occupancies |
-
Import analysis parameters from a .yaml
format file or export an AnalysisParameters
object to .yaml
format.
parseParameters(path)
-
-exportParameters(d, file = "analysis_parameters.yaml")
-
-# S4 method for AnalysisParameters
-exportParameters(d, file = "analysis_parameters.yaml")
-
-# S4 method for Analysis
-exportParameters(d, file = "analysis_parameters.yaml")
file path of .yaml file to parse
S4 object of class AnalysisParameters or Analysis
File name and path to export to
## Import analysis parameters
-paramFile <- system.file('defaultParameters.yaml',package = 'metabolyseR')
-p <- parseParameters(paramFile)
-p
-#> Parameters:
-#> pre-treatment
-#> QC
-#> occupancyFilter
-#> cls = class
-#> QCidx = QC
-#> occupancy = 0.667
-#> impute
-#> cls = class
-#> QCidx = QC
-#> occupancy = 0.667
-#> RSDfilter
-#> cls = class
-#> QCidx = QC
-#> RSDthresh = 0.5
-#> removeQC
-#> cls = class
-#> QCidx = QC
-#> occupancyFilter
-#> maximum
-#> cls = class
-#> occupancy = 0.667
-#> impute
-#> class
-#> cls = class
-#> occupancy = 0.667
-#> transform
-#> TICnorm
-#>
-#> correlations
-#> method = pearson
-#> pAdjustMethod = bonferroni
-#> corPvalue = 0.05
-
-if (FALSE) {
-## Export analysis parameters
-exportParameters(p,file = 'analysis_parameters.yaml')
-}
-
Retain samples, classes or features in an AnalysisData
object.
keepClasses(d, cls = "class", classes = c())
-
-# S4 method for AnalysisData
-keepClasses(d, cls = "class", classes = c())
-
-keepFeatures(d, features = character())
-
-# S4 method for AnalysisData
-keepFeatures(d, features = character())
-
-keepSamples(d, idx = "fileOrder", samples = c())
-
-# S4 method for AnalysisData
-keepSamples(d, idx = "fileOrder", samples = c())
S4 object of class AnalysisData
info column to use for class information
classes to keep
features to keep
info column containing sample indexes
sample indexes to keep
An S4 object of class AnalysisData
with specified samples, classes or features retained.
keepClasses
: Keep classes.
keepFeatures
: Keep features.
keepSamples
: Keep samples.
library(metaboData)
- d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
- ## Keep classes
- d %>%
- keepClasses(cls = 'day',classes = 'H')
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 101
-#> Info: 9
-#>
-
- ## Keep features
- d %>%
- keepFeatures(features = c('N200','N201'))
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 2
-#> Info: 9
-#>
-
- ## Keep samples
- d %>%
- keepSamples(idx = 'injorder',samples = c(1,10))
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 2
-#> Features: 101
-#> Info: 9
-#>
-
Keep classes from an AnalysisData object.
-keepClasses(d, cls = "class", classes = c()) - -# S4 method for AnalysisData -keepClasses(d, cls = "class", classes = c())- -
d | -S4 object of class AnalysisData |
-
---|---|
cls | -info column to use for class information |
-
classes | -classes to keep |
-
Keep samples from an AnalysisData object.
-keepSamples(d, idx = "fileOrder", samples = c()) - -# S4 method for AnalysisData -keepSamples(d, idx = "fileOrder", samples = c())- -
d | -S4 object of class AnalysisData |
-
---|---|
idx | -info column containing sample indexes |
-
samples | -sample indexes to keep |
-
Linear regression
-linearRegression(
- x,
- cls = "class",
- pAdjust = "bonferroni",
- returnModels = FALSE
-)
-
-# S4 method for AnalysisData
-linearRegression(
- x,
- cls = "class",
- pAdjust = "bonferroni",
- returnModels = FALSE
-)
S4 object of class AnalysisData
vector of sample information column names to regress
p value adjustment method
should models be returned
An S4 object of class Univariate
.
library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-## Perform linear regression
-lr_analysis <- linearRegression(d,cls = 'injorder')
-
-## Extract significant features
-explanatoryFeatures(lr_analysis)
-#> # A tibble: 5 × 15
-#> Response Feature r.squared adj.r.squared sigma statistic p.value df logLik
-#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 injorder N283 0.310 0.304 4.27 53.0 4.10e-11 1 -343.
-#> 2 injorder N221 0.140 0.133 5.87 19.3 2.50e- 5 1 -382.
-#> 3 injorder N255 0.119 0.111 11.1 15.9 1.17e- 4 1 -458.
-#> 4 injorder N267 0.118 0.111 26.4 15.8 1.22e- 4 1 -562.
-#> 5 injorder N297 0.107 0.0995 44.7 14.1 2.65e- 4 1 -625.
-#> # … with 6 more variables: AIC <dbl>, BIC <dbl>, deviance <dbl>,
-#> # df.residual <int>, nobs <int>, adjusted.p.value <dbl>
-
Multidimensional scaling of random forest proximities.
-mds(x, dimensions = 2, idx = NULL)
-
-# S4 method for RandomForest
-mds(x, dimensions = 2, idx = NULL)
-
-# S4 method for list
-mds(x, dimensions = 2, idx = NULL)
-
-# S4 method for Analysis
-mds(x, dimensions = 2, idx = NULL)
S4 object of class RandomForest
, Analysis
or a list
The number of dimensions by which the data are to be represented.
sample information column to use for sample names. If NULL
, the sample row number will be used. Sample names should be unique for each row of data.
A tibble containing the scaled dimensions.
-library(metaboData)
-
-x <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- occupancyMaximum(cls = 'day') %>%
- transformTICnorm()
-
-rf <- randomForest(x,cls = 'day')
-
-mds(rf)
-#> # A tibble: 120 × 5
-#> Response Comparison Sample `Dimension 1` `Dimension 2`
-#> <chr> <chr> <dbl> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H 1 -0.0129 -0.190
-#> 2 day 1~2~3~4~5~H 2 -0.101 -0.254
-#> 3 day 1~2~3~4~5~H 3 -0.0156 0.173
-#> 4 day 1~2~3~4~5~H 4 -0.0896 0.147
-#> 5 day 1~2~3~4~5~H 5 0.146 -0.0566
-#> 6 day 1~2~3~4~5~H 6 -0.132 0.0946
-#> 7 day 1~2~3~4~5~H 7 -0.0862 -0.195
-#> 8 day 1~2~3~4~5~H 8 0.144 -0.0917
-#> 9 day 1~2~3~4~5~H 9 0.0408 -0.110
-#> 10 day 1~2~3~4~5~H 10 -0.146 0.155
-#> # … with 110 more rows
-
Perform analyses containing multiple analysis element steps.
-metabolyse(data, info, parameters = analysisParameters(), verbose = TRUE)
-
-reAnalyse(analysis, parameters = analysisParameters(), verbose = TRUE)
-
-# S4 method for Analysis
-reAnalyse(analysis, parameters = analysisParameters(), verbose = TRUE)
tibble or data.frame containing data to analyse
tibble or data.frame containing data info or meta data
an object of AnalysisParameters class containing
-parameters for analysis. Default calls analysisParameters()
should output be printed to the console
an object of class Analysis containing previous -analysis results
An S4 object of class Analysis
.
Routine analyses are those that are often made up of numerous steps where parameters have likely already been previously established.
-The emphasis here is on convenience with as little code as possible required.
-In these analyses, the necessary analysis elements, order and parameters are first prepared and then the analysis routine subsequently performed in a single step.
-The metabolyse
function provides this utility, where the metabolome data, sample meta information and analysis parameters are provided.
-The reAnalyse
method can be used to perform further analyses on the results.
library(metaboData)
-
-## Generate analysis parameters
-p <- analysisParameters(c('pre-treatment','modelling'))
-
-## Alter pre-treatment and modelling parameters to use different methods
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(occupancyFilter = 'maximum',
- transform = 'TICnorm')
-)
-parameters(p,'modelling') <- modellingParameters('anova')
-
-## Change "cls" parameters
-changeParameter(p,'cls') <- 'day'
-
-## Run analysis using a subset of the abr1 negative mode data set
-analysis <- metabolyse(abr1$neg[,1:200],
- abr1$fact,
- p)
-#>
-#> metabolyseR v0.14.9 Thu Jan 27 11:58:32 2022
-#> ________________________________________________________________________________
-#> Parameters:
-#> pre-treatment
-#> occupancyFilter
-#> maximum
-#> cls = day
-#> occupancy = 2/3
-#> transform
-#> TICnorm
-#>
-#> modelling
-#> anova
-#> cls = day
-#> pAdjust = bonferroni
-#> comparisons = list()
-#> returnModels = FALSE
-#> ________________________________________________________________________________
-#> Pre-treatment …
-#>
Pre-treatment ✔ [0.6S]
-#> Modelling …
-#>
Modelling ✔ [0.7S]
-#> ________________________________________________________________________________
-#>
-#> Complete! [1.3S]
-
-## Re-analyse to include correlation analysis
-analysis <- reAnalyse(analysis,
- parameters = analysisParameters('correlations'))
-#>
-#> metabolyseR v0.14.9 Thu Jan 27 11:58:33 2022
-#> ________________________________________________________________________________
-#> Parameters:
-#> correlations
-#> method = pearson
-#> pAdjustMethod = bonferroni
-#> corPvalue = 0.05
-#> ________________________________________________________________________________
-#>
-#> Correlations …
-#>
Correlations ✔ [0.1S]
-#> ________________________________________________________________________________
-#>
-#> Complete! [0.1S]
-#>
-
-print(analysis)
-#>
-#> metabolyseR v0.14.9
-#> Analysis:
-#> Thu Jan 27 11:58:32 2022
-#>
-#> Raw Data:
-#> No. samples = 120
-#> No. features = 200
-#>
-#> Pre-treated Data:
-#> Thu Jan 27 11:58:33 2022
-#> No. samples = 120
-#> No. features = 48
-#>
-#> Modelling:
-#> Thu Jan 27 11:58:33 2022
-#> Methods: anova
-#>
-#> Correlations:
-#> Thu Jan 27 11:58:33 2022
-#> No. correlations = 140
-
-
Methods for accessing modelling results.
-binaryComparisons(x, cls = "class")
-
-# S4 method for AnalysisData
-binaryComparisons(x, cls = "class")
-
-type(x)
-
-# S4 method for RandomForest
-type(x)
-
-response(x)
-
-# S4 method for RandomForest
-response(x)
-
-metrics(x)
-
-# S4 method for RandomForest
-metrics(x)
-
-# S4 method for list
-metrics(x)
-
-# S4 method for Analysis
-metrics(x)
-
-importanceMetrics(x)
-
-# S4 method for RandomForest
-importanceMetrics(x)
-
-importance(x)
-
-# S4 method for RandomForest
-importance(x)
-
-# S4 method for Univariate
-importance(x)
-
-# S4 method for list
-importance(x)
-
-# S4 method for Analysis
-importance(x)
-
-proximity(x, idx = NULL)
-
-# S4 method for RandomForest
-proximity(x, idx = NULL)
-
-# S4 method for list
-proximity(x, idx = NULL)
-
-# S4 method for Analysis
-proximity(x, idx = NULL)
-
-explanatoryFeatures(x, ...)
-
-# S4 method for Univariate
-explanatoryFeatures(x, threshold = 0.05)
-
-# S4 method for RandomForest
-explanatoryFeatures(x, metric = "FalsePositiveRate", threshold = 0.05)
-
-# S4 method for list
-explanatoryFeatures(x, ...)
-
-# S4 method for Analysis
-explanatoryFeatures(x, ...)
S4 object of class AnalysisData
,RandomForest
, Univariate
, Analysis
or a list.
sample information column to use
sample information column to use for sample names. If NULL
, the sample row number will be used. Sample names should be unique for each row of data.
arguments to pass to the method for the specific class
threshold below which explanatory features are extracted
importance metric for which to retrieve explanatory features
binaryComparisons
: Return a vector of all possible binary comparisons for a given sample information column.
type
: Return the type of random forest analysis.
response
: Return the response variable name used for a random forest analysis.
metrics
: Retrieve the model performance metrics for a random forest analysis
importanceMetrics
: Retrieve the available feature importance metrics for a random forest analysis.
importance
: Retrieve feature importance results.
proximity
: Retrieve the random forest sample proximities.
explanatoryFeatures
: Retrieve explanatory features.
library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-## Return possible binary comparisons for the 'day' column
-binaryComparisons(d,cls = 'day')
-#> [1] "1~2" "1~3" "1~4" "1~5" "1~H" "2~3" "2~4" "2~5" "2~H" "3~4" "3~5" "3~H"
-#> [13] "4~5" "4~H" "5~H"
-
-## Perform random forest analysis
-rf_analysis <- randomForest(d,cls = 'day')
-
-## Return the type of random forest
-type(rf_analysis)
-#> [1] "classification"
-
-## Return the response variable name used
-response(rf_analysis)
-#> [1] "day"
-
-## Retrieve the model performance metrics
-metrics(rf_analysis)
-#> # A tibble: 4 × 5
-#> Response Comparison .metric .estimator .estimate
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 1~2~3~4~5~H accuracy multiclass 0.567
-#> 2 day 1~2~3~4~5~H kap multiclass 0.48
-#> 3 day 1~2~3~4~5~H roc_auc hand_till 0.886
-#> 4 day 1~2~3~4~5~H margin NA 0.0424
-
-## Show the available feature importance metrics
-importanceMetrics(rf_analysis)
-#> [1] "1" "2" "3"
-#> [4] "4" "5" "FalsePositiveRate"
-#> [7] "H" "MeanDecreaseAccuracy" "MeanDecreaseGini"
-#> [10] "SelectionFrequency"
-
-## Retrieve the feature importance results
-importance(rf_analysis)
-#> # A tibble: 1,010 × 5
-#> Response Comparison Feature Metric Value
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 1~2~3~4~5~H N200 1 0
-#> 2 day 1~2~3~4~5~H N200 2 0
-#> 3 day 1~2~3~4~5~H N200 3 0
-#> 4 day 1~2~3~4~5~H N200 4 0
-#> 5 day 1~2~3~4~5~H N200 5 0
-#> 6 day 1~2~3~4~5~H N200 FalsePositiveRate 2.35e-40
-#> 7 day 1~2~3~4~5~H N200 H 0
-#> 8 day 1~2~3~4~5~H N200 MeanDecreaseAccuracy 0
-#> 9 day 1~2~3~4~5~H N200 MeanDecreaseGini 6.00e- 2
-#> 10 day 1~2~3~4~5~H N200 SelectionFrequency 1.6 e+ 1
-#> # … with 1,000 more rows
-
-## Retrieve the sample proximities
-proximity(rf_analysis)
-#> # A tibble: 14,400 × 5
-#> Response Comparison Sample1 Sample2 Proximity
-#> <chr> <chr> <int> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H 1 1 1
-#> 2 day 1~2~3~4~5~H 1 2 0.0704
-#> 3 day 1~2~3~4~5~H 1 3 0.0580
-#> 4 day 1~2~3~4~5~H 1 4 0.0930
-#> 5 day 1~2~3~4~5~H 1 5 0.0556
-#> 6 day 1~2~3~4~5~H 1 6 0.0435
-#> 7 day 1~2~3~4~5~H 1 7 0.0556
-#> 8 day 1~2~3~4~5~H 1 8 0.0441
-#> 9 day 1~2~3~4~5~H 1 9 0.106
-#> 10 day 1~2~3~4~5~H 1 10 0
-#> # … with 14,390 more rows
-
-## Retrieve the explanatory features
-explanatoryFeatures(rf_analysis,metric = 'FalsePositiveRate',threshold = 0.05)
-#> # A tibble: 35 × 5
-#> Response Comparison Feature Metric Value
-#> <chr> <chr> <chr> <chr> <dbl>
-#> 1 day 1~2~3~4~5~H N229 FalsePositiveRate 5.75e-129
-#> 2 day 1~2~3~4~5~H N259 FalsePositiveRate 4.88e- 72
-#> 3 day 1~2~3~4~5~H N277 FalsePositiveRate 3.98e- 67
-#> 4 day 1~2~3~4~5~H N255 FalsePositiveRate 3.27e- 53
-#> 5 day 1~2~3~4~5~H N213 FalsePositiveRate 4.92e- 45
-#> 6 day 1~2~3~4~5~H N200 FalsePositiveRate 2.35e- 40
-#> 7 day 1~2~3~4~5~H N221 FalsePositiveRate 1.80e- 38
-#> 8 day 1~2~3~4~5~H N299 FalsePositiveRate 4.91e- 36
-#> 9 day 1~2~3~4~5~H N245 FalsePositiveRate 9.75e- 27
-#> 10 day 1~2~3~4~5~H N279 FalsePositiveRate 2.38e- 20
-#> # … with 25 more rows
-
Retrieve the available modelling methods and parameters.
-modellingMethods()
-
-modellingParameters(methods)
character vector of available modelling methods
## Retrieve the available modelling methods
-modellingMethods()
-#> [1] "anova" "ttest" "linearRegression" "randomForest"
-
-## Retrieve the modelling parameters for the anova method
-p <- modellingParameters('anova')
-
-## Assign the modelling parameters to analysis parameters
-mp <- analysisParameters('modelling')
-
-parameters(mp,'modelling') <- p
-
-print(mp)
-#> Parameters:
-#> modelling
-#> anova
-#> cls = class
-#> pAdjust = bonferroni
-#> comparisons = list()
-#> returnModels = FALSE
-#>
-
Return names of available modelling methods.
-modellingMethods()- - - -
Return default parameters for a given modelling method.
-modellingParameters(methods)- -
methods | -character vector of available methods.
-Use |
-
---|
-
Return the number of features.
-nFeatures(x, ...) - -# S4 method for AnalysisData -nFeatures(x) - -# S4 method for Analysis -nFeatures(x, type = "raw")- -
x | -S4 object of class AnalysisData or Analysis |
-
---|---|
... | -arguments to pass to the appropriate method |
-
type | -return features from "raw" or "pre-treated" data |
-
Return the number of samples.
-nSamples(x, ...) - -# S4 method for AnalysisData -nSamples(x) - -# S4 method for Analysis -nSamples(x, type = "raw")- -
x | -S4 object of class AnalysisData or Analysis |
-
---|---|
... | -arguments to pass to the appropriate method |
-
type | -return the number of samples from "raw" or "pre-treated" data |
-
Calculate the class occupancies of all features in an AnalysisData
object.
occupancy(d, cls = "class")
-
-# S4 method for AnalysisData
-occupancy(d, cls = "class")
S4 object of class AnalysisData
sample information column for which to compute class occupancies
A tibble containing feature class proportional occupancies.
-library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-occupancy(d,cls = 'day')
-#> # A tibble: 596 × 5
-#> day Feature N `Class total` Occupancy
-#> <fct> <chr> <dbl> <int> <dbl>
-#> 1 1 N200 1 20 0.05
-#> 2 1 N201 3 20 0.15
-#> 3 1 N202 3 20 0.15
-#> 4 1 N203 19 20 0.95
-#> 5 1 N204 4 20 0.2
-#> 6 1 N205 17 20 0.85
-#> 7 1 N206 4 20 0.2
-#> 8 1 N207 8 20 0.4
-#> 9 1 N208 7 20 0.35
-#> 10 1 N209 16 20 0.8
-#> # … with 586 more rows
-
Feature filtering based on class occupancy.
-occupancyMaximum(d, cls = "class", occupancy = 2/3)
-
-# S4 method for AnalysisData
-occupancyMaximum(d, cls = "class", occupancy = 2/3)
-
-occupancyMinimum(d, cls = "class", occupancy = 2/3)
-
-# S4 method for AnalysisData
-occupancyMinimum(d, cls = "class", occupancy = 2/3)
S4 object of class AnalysisData
sample information column name to use for class data
feature occupancy filtering threshold, below which features will be removed
An S4 object of class AnalysisData
containing the class occupancy filtered data.
Occupancy provides a useful metric by which to filter poorly represented features (features containing a majority zero or missing values). -An occupancy threshold provides a means of specifying this majority with variables below the threshold excluded from further analyses. -However, this can be complicated by an underlying class structure present within the data where a variable may be well represented within one class but not in another.
-occupancyMaximum
: Maximum occupancy threshold feature filtering. Where the maximum occupancy across all classes is above the threshold. Therefore, for a feature to be retained, only a single class needs to have an occupancy above the threshold.
occupancyMinimum
: Minimum occupancy threshold feature filtering. Where the minimum occupancy across all classes is required to be above the threshold. Therefore, for a feature to be retained, all classes would need to have an occupancy above the threshold.
## Each of the following examples shows the application
-## of the feature occupancy filtering method and
-## then a Principal Component Analysis is plotted to show
-## its effect on the data structure.
-
-## Initial example data preparation
-library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-## Maximum occupancy threshold feature filtering
-d %>%
- occupancyMaximum(cls = 'day') %>%
- plotPCA(cls = 'day')
-
-
-## Minimum occupancy threshold feature filtering
-d %>%
- occupancyMinimum(cls = 'day') %>%
- plotPCA(cls = 'day')
-
-
Maximum occupancy filtering of sample data.
-occupancyMaximum(dat, cls = "class", occupancy = 2/3) - -# S4 method for AnalysisData -occupancyMaximum(dat, cls = "class", occupancy = 2/3)- -
dat | -S4 object of class AnalysisData |
-
---|---|
cls | -info column to use for class data |
-
occupancy | -occupancy threshold |
-
Minimum occupancy filtering of sample data.
-occupancyMinimum(dat, cls = "class", occupancy = 2/3) - -# S4 method for AnalysisData -occupancyMinimum(dat, cls = "class", occupancy = 2/3)- -
dat | -S4 object of class AnalysisData |
-
---|---|
cls | -info column to use for class data |
-
occupancy | -occupancy threshold |
-
Get or set parameters for AnalysisParameters
or Analysis
class objects.
parameters(d, ...)
-
-# S4 method for AnalysisParameters
-parameters(d, element)
-
-# S4 method for Analysis
-parameters(d)
-
-parameters(d, element) <- value
-
-# S4 method for AnalysisParameters
-parameters(d, element) <- value
-
-# S4 method for Analysis
-parameters(d) <- value
S4 object of class AnalysisParameters
or Analysis
arguments to pass to the appropriate method
analysis element for parameters to extract or assign.
-Should be one of those returned by analysisElements()
list containing parameter values
p <- analysisParameters('pre-treatment')
-
-## extract pre-treatment parameters
-parameters(p,'pre-treatment')
-#> $QC
-#> $QC$occupancyFilter
-#> $QC$occupancyFilter$cls
-#> [1] "class"
-#>
-#> $QC$occupancyFilter$QCidx
-#> [1] "QC"
-#>
-#> $QC$occupancyFilter$occupancy
-#> 2/3
-#>
-#>
-#> $QC$impute
-#> $QC$impute$cls
-#> [1] "class"
-#>
-#> $QC$impute$QCidx
-#> [1] "QC"
-#>
-#> $QC$impute$occupancy
-#> 2/3
-#>
-#> $QC$impute$parallel
-#> [1] "variables"
-#>
-#> $QC$impute$seed
-#> [1] 1234
-#>
-#>
-#> $QC$RSDfilter
-#> $QC$RSDfilter$cls
-#> [1] "class"
-#>
-#> $QC$RSDfilter$QCidx
-#> [1] "QC"
-#>
-#> $QC$RSDfilter$RSDthresh
-#> [1] 50
-#>
-#>
-#> $QC$removeQC
-#> $QC$removeQC$cls
-#> [1] "class"
-#>
-#> $QC$removeQC$QCidx
-#> [1] "QC"
-#>
-#>
-#>
-#> $occupancyFilter
-#> $occupancyFilter$maximum
-#> $occupancyFilter$maximum$cls
-#> [1] "class"
-#>
-#> $occupancyFilter$maximum$occupancy
-#> 2/3
-#>
-#>
-#>
-#> $impute
-#> $impute$class
-#> $impute$class$cls
-#> [1] "class"
-#>
-#> $impute$class$occupancy
-#> 2/3
-#>
-#> $impute$class$seed
-#> [1] 1234
-#>
-#>
-#>
-#> $transform
-#> $transform$TICnorm
-#> named list()
-#>
-#>
-
-## set pre-treatment parameters
-parameters(p,'pre-treatment') <- preTreatmentParameters(
- list(
- remove = 'classes',
- QC = c('RSDfilter','removeQC'),
- transform = 'TICnorm'
- )
-)
-
-print(p)
-#> Parameters:
-#> pre-treatment
-#> remove
-#> classes
-#> cls = class
-#> classes = c()
-#> QC
-#> RSDfilter
-#> cls = class
-#> QCidx = QC
-#> RSDthresh = 50
-#> removeQC
-#> cls = class
-#> QCidx = QC
-#> transform
-#> TICnorm
-#>
-
Parse a .yaml file containing analysis parameters.
-parseParameters(path)- -
path | -file path of .yaml file to parse |
-
---|
--paramFile <- system.file('defaultParameters.yaml',package = 'metabolyseR') -p <- parseParameters(paramFile) -p -#> Parameters: -#> pre-treatment -#> QC -#> occupancyFilter -#> cls = class -#> QCidx = QC -#> occupancy = 0.667 -#> impute -#> cls = class -#> QCidx = QC -#> occupancy = 0.667 -#> RSDfilter -#> cls = class -#> QCidx = QC -#> RSDthresh = 0.5 -#> removeQC -#> cls = class -#> QCidx = QC -#> occupancyFilter -#> maximum -#> cls = class -#> occupancy = 0.667 -#> impute -#> class -#> cls = class -#> occupancy = 0.667 -#> nCores = 4 -#> clusterType = FORK -#> transform -#> TICnorm -#> -#> correlations -#> method = pearson -#> pAdjustMethod = bonferroni -#> corPvalue = 0.05-
R/plotExplanatoryHeatmap.R
- plotExplanatoryHeatmap.Rd
Plot a heatmap of explanatory features.
-plotExplanatoryHeatmap(x, ...)
-
-# S4 method for Univariate
-plotExplanatoryHeatmap(
- x,
- threshold = 0.05,
- title = "",
- distanceMeasure = "euclidean",
- clusterMethod = "ward.D2",
- featureNames = TRUE,
- dendrogram = TRUE,
- featureLimit = Inf
-)
-
-# S4 method for RandomForest
-plotExplanatoryHeatmap(
- x,
- metric = "FalsePositiveRate",
- threshold = 0.05,
- title = "",
- distanceMeasure = "euclidean",
- clusterMethod = "ward.D2",
- featureNames = TRUE,
- dendrogram = TRUE,
- featureLimit = Inf
-)
-
-# S4 method for list
-plotExplanatoryHeatmap(
- x,
- threshold = 0.05,
- distanceMeasure = "euclidean",
- clusterMethod = "ward.D2",
- featureNames = TRUE,
- featureLimit = Inf
-)
-
-# S4 method for Analysis
-plotExplanatoryHeatmap(
- x,
- threshold = 0.05,
- distanceMeasure = "euclidean",
- clusterMethod = "ward.D2",
- featureNames = TRUE,
- featureLimit = Inf
-)
object of class Univariate
, RandomForest
or
-Analysis
arguments to pass to the appropriate method
score threshold to use for specifying explanatory features
plot title
distance measure to use for clustering. See details.
clustering method to use. See details
should feature names be plotted?
TRUE/FALSE. Should the dendrogram be plotted?
The maximum number of features to plot
importance metric on which to retrieve explanatory features
Distance measures can be one of any that can be used for the method
argument of dist()
.
Cluster methods can be one of any that can be used for the method
argument of hclust()
.
library(metaboData)
-x <- analysisData(data = abr1$neg[,200:300],info = abr1$fact)
-
-## random forest classification example
-random_forest <- randomForest(x,cls = 'day')
-
-plotExplanatoryHeatmap(random_forest)
-
-
-## random forest regression example
-random_forest <- randomForest(x,cls = 'injorder')
-
-plotExplanatoryHeatmap(random_forest,metric = '%IncMSE',threshold = 2)
-
-
Plot the trend of a feature.
-plotFeature(analysis, feature, cls = "class", label = NULL, labelSize = 2, ...)
-
-# S4 method for AnalysisData
-plotFeature(analysis, feature, cls = "class", label = NULL, labelSize = 2)
-
-# S4 method for Analysis
-plotFeature(
- analysis,
- feature,
- cls = "class",
- label = NULL,
- labelSize = 2,
- type = "pre-treated"
-)
an object of class AnalysisData
or Analysis
feature name to plot
information column to use for class labels
information column to use for sample labels
sample label size
arguments to pass to the appropriate method
raw
or pre-treated
data to plot
library(metaboData)
-
-d <- analysisData(abr1$neg,abr1$fact)
-
-## Plot a categorical response variable
-plotFeature(d,'N133',cls = 'day')
-
-
-## Plot a continuous response variable
-plotFeature(d,'N133',cls = 'injorder')
-
-
Plot Univariate or random forest feature importance.
-plotImportance(x, ...)
-
-# S4 method for Univariate
-plotImportance(x, response = "class", rank = TRUE, threshold = 0.05)
-
-# S4 method for RandomForest
-plotImportance(x, metric = "FalsePositiveRate", rank = TRUE)
-
-# S4 method for list
-plotImportance(x, metric = "FalsePositiveRate")
S4 object of class Univariate
or RandomForest
arguments to pass to specific method
response results to plot
rank feature order for plotting
explanatory threshold line for the output plot
importance metric to plot
library(metaboData)
-
-x <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- keepClasses(cls = 'day',classes = c('H','1','5')) %>%
- occupancyMaximum(cls = 'day') %>%
- transformTICnorm()
-
-rf <- randomForest(x,cls = 'day')
-
-plotImportance(rf,rank = FALSE)
-
-
Plot linear discriminant analysis results of pre-treated data
-plotLDA(
- analysis,
- cls = "class",
- label = NULL,
- scale = TRUE,
- center = TRUE,
- xAxis = "DF1",
- yAxis = "DF2",
- shape = FALSE,
- ellipses = TRUE,
- title = "PC-LDA",
- legendPosition = "bottom",
- labelSize = 2,
- ...
-)
-
-# S4 method for AnalysisData
-plotLDA(
- analysis,
- cls = "class",
- label = NULL,
- scale = TRUE,
- center = TRUE,
- xAxis = "DF1",
- yAxis = "DF2",
- shape = FALSE,
- ellipses = TRUE,
- title = "PC-LDA",
- legendPosition = "bottom",
- labelSize = 2
-)
-
-# S4 method for Analysis
-plotLDA(
- analysis,
- cls = "class",
- label = NULL,
- scale = TRUE,
- center = TRUE,
- xAxis = "DF1",
- yAxis = "DF2",
- shape = FALSE,
- ellipses = TRUE,
- title = "PC-LDA",
- legendPosition = "bottom",
- labelSize = 2,
- type = "raw"
-)
S4 object of class AnalysisData
or Analysis
name of sample information column to use for class labels
name of sample information column to use for sample labels. Set to NULL for no labels.
scale the data
center the data
principal component to plot on the x-axis
principal component to plot on the y-axis
TRUE/FALSE use shape aesthetic for plot points. -Defaults to TRUE when the number of classes is greater than 12
TRUE/FALSE, plot multivariate normal distribution 95% -confidence ellipses for each class
plot title
legend position to pass to legend.position argument
-of ggplot2::theme
. Set to "none" to remove legend.
label size. Ignored if label
is NULL
arguments to pass to the appropriate method
raw
or pre-treated
data to plot
library(metaboData)
-
-d <- analysisData(abr1$neg,abr1$fact) %>%
- occupancyMaximum(cls = 'day')
-
-## LDA plot
-plotLDA(d,cls = 'day')
-
-
Plot multidimensional scaling plot for a RandomForest
class object.
plotMDS(
- x,
- cls = "class",
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- title = "",
- legendPosition = "bottom",
- labelSize = 2
-)
-
-# S4 method for RandomForest
-plotMDS(
- x,
- cls = "class",
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- title = "",
- legendPosition = "bottom",
- labelSize = 2
-)
-
-# S4 method for list
-plotMDS(
- x,
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- title = "",
- legendPosition = "bottom",
- labelSize = 2
-)
S4 object of class RandomForest
sample information column to use for sample labelling, -Set to NULL for no labelling.
sample information column to use for sample labels. Set to NULL for no labels.
TRUE/FALSE use shape aesthetic for plot points. -Defaults to TRUE when the number of classes is greater than 12
TRUE/FALSE, plot multivariate normal distribution 95% -confidence ellipses for each class
plot title
legend position to pass to legend.position argument
-of ggplot2::theme
. Set to "none" to remove legend.
label size. Ignored if label
is NULL
library(metaboData)
-
-x <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- occupancyMaximum(cls = 'day') %>%
- transformTICnorm()
-
-rf <- randomForest(x,cls = 'day')
-
-plotMDS(rf,cls = 'day')
-
-
Plot random forest model performance metrics
-plotMetrics(x, response = "class")
-
-# S4 method for RandomForest
-plotMetrics(x)
-
-# S4 method for list
-plotMetrics(x)
S4 object of class RandomForest
response results to plot
library(metaboData)
-
-x <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- keepClasses(cls = 'day',classes = c('H','1','5')) %>%
- occupancyMaximum(cls = 'day') %>%
- transformTICnorm()
-
-rf <- randomForest(x,cls = 'day',binary = TRUE)
-
-plotMetrics(rf,response = 'day')
-
-
Plot class occupancy distributions.
-plotOccupancy(x, cls = "class", ...)
-
-# S4 method for AnalysisData
-plotOccupancy(x, cls = "class")
-
-# S4 method for Analysis
-plotOccupancy(x, cls = "class", type = "raw")
S4 object of class AnalysisData
or Analysis
sample information column to use for class labels
arguments to pass to the appropriate method
raw
or preTreated
data to plot
library(metaboData)
-
-d <- analysisData(abr1$neg,abr1$fact)
-
-## Plot class occupancy distributions
-plotOccupancy(d,cls = 'day')
-
-
Plot Principal Component Analysis results.
-plotPCA(
- analysis,
- cls = "class",
- label = NULL,
- scale = TRUE,
- center = TRUE,
- xAxis = "PC1",
- yAxis = "PC2",
- shape = FALSE,
- ellipses = TRUE,
- title = "PCA",
- legendPosition = "bottom",
- labelSize = 2,
- ...
-)
-
-# S4 method for AnalysisData
-plotPCA(
- analysis,
- cls = "class",
- label = NULL,
- scale = TRUE,
- center = TRUE,
- xAxis = "PC1",
- yAxis = "PC2",
- shape = FALSE,
- ellipses = TRUE,
- title = "Principle Component Analysis (PCA)",
- legendPosition = "bottom",
- labelSize = 2
-)
-
-# S4 method for Analysis
-plotPCA(
- analysis,
- cls = "class",
- label = NULL,
- scale = TRUE,
- center = TRUE,
- xAxis = "PC1",
- yAxis = "PC2",
- shape = FALSE,
- ellipses = TRUE,
- title = "PCA",
- legendPosition = "bottom",
- labelSize = 2,
- type = "raw"
-)
object of class AnalysisData
or Analysis
name of class information column to use for sample labelling
name of class information column to use for sample labels. Set to NULL for no labels.
scale the data
center the data
principal component to plot on the x-axis
principal component to plot on the y-axis
TRUE/FALSE use shape aesthetic for plot points. -Defaults to TRUE when the number of classes is greater than 12
TRUE/FALSE, plot multivariate normal distribution 95% -confidence ellipses for each class
plot title
legend position to pass to legend.position argument
-of ggplot2::theme
. Set to "none" to remove legend.
label size. Ignored if label
is NULL
arguments to pass to the appropriate method
raw
or pre-treated
data to plot
library(metaboData)
-
-d <- analysisData(abr1$neg,abr1$fact) %>%
- occupancyMaximum(cls = 'day')
-
-## PCA plot
-plotPCA(d,cls = 'day')
-
-
Plot receiver operator characteristic curves for a
-RandomForest
class object.
plotROC(x, title = "", legendPosition = "bottom")
-
-# S4 method for RandomForest
-plotROC(x, title = "", legendPosition = "bottom")
-
-# S4 method for list
-plotROC(x, title = "", legendPosition = "bottom")
S4 object of class RandomForest
plot title
legend position to pass to legend.position
-argument of ggplot2::theme
. Set to "none" to remove legend.
library(metaboData)
-
-x <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- occupancyMaximum(cls = 'day') %>%
- transformTICnorm()
-
-rf <- randomForest(x,cls = 'day')
-
-plotROC(rf)
-
-
Plot RSD distributions of raw data in quality control samples.
-plotRSD(analysis, cls = "class", ...)
-
-# S4 method for AnalysisData
-plotRSD(analysis, cls = "class")
-
-# S4 method for Analysis
-plotRSD(analysis, cls = "class", type = "raw")
object of class AnalysisData
or Analysis
information column to use for class labels
arguments to pass to the appropriate method
raw
or pre-treated
data to plot
library(metaboData)
-
-d <- analysisData(abr1$neg,abr1$fact)
-
-## Plot class RSD distributions
-plotRSD(d,cls = 'day')
-#> Warning: Removed 716 rows containing non-finite values (stat_density).
-#> Warning: Removed 6 row(s) containing missing values (geom_path).
-
-
A multidimensional scaling (MDS) plot of supervised random forest analysis
-plotSupervisedRF(
- x,
- cls = "class",
- rf = list(),
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- ROC = TRUE,
- seed = 1234,
- title = "",
- legendPosition = "bottom",
- labelSize = 2,
- ...
-)
-
-# S4 method for AnalysisData
-plotSupervisedRF(
- x,
- cls = "class",
- rf = list(),
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- ROC = TRUE,
- seed = 1234,
- title = "",
- legendPosition = "bottom",
- labelSize = 2
-)
-
-# S4 method for Analysis
-plotSupervisedRF(
- x,
- cls = "class",
- rf = list(),
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- ROC = TRUE,
- seed = 1234,
- title = "",
- legendPosition = "bottom",
- labelSize = 2,
- type = "raw"
-)
object of class AnalysisData
or Analysis
containing analysis results
information column to use for sample classes
list of additional parameters to pass to randomForest
information column to use for sample labels. Set to NULL
for no labels.
TRUE/FALSE use shape aesthetic for plot points. -Defaults to TRUE when the number of classes is greater than 12
TRUE/FALSE, plot multivariate normal distribution 95% -confidence ellipses for each class
should receiver-operator characteristics be plotted?
random number seed
plot title
legend position to pass to legend.position argument
-of ggplot2::theme
. Set to "none" to remove legend.
label size. Ignored if label
is NULL
arguments to pass to the appropriate method
raw
or pre-treated
data to plot
library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-## Supervised random forest MDS plot
-plotSupervisedRF(d,cls = 'day')
-
-
Plot total ion counts of sample data.
-plotTIC(analysis, by = "injOrder", colour = "block", ...)
-
-# S4 method for AnalysisData
-plotTIC(analysis, by = "injOrder", colour = "block")
-
-# S4 method for Analysis
-plotTIC(
- analysis,
- by = "injOrder",
- colour = "block",
- type = c("raw", "pre-treated")
-)
S4 object of class AnalysisData
or Analysis
information column to plot against
information column to provide colour labels
arguments to pass to the appropriate method
raw
or pre-treated
sample data
library(metaboData)
-
-d <- analysisData(abr1$neg,abr1$fact)
-
-## Plot sample TICs
-plotTIC(d,by = 'injorder',colour = 'day')
-
-
-plotTIC(d,by = 'day',colour = 'day')
-
-
A multidimensional scaling (MDS) plot of unsupervised random forest analysis
-plotUnsupervisedRF(
- x,
- cls = "class",
- rf = list(),
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- seed = 1234,
- title = "",
- legendPosition = "bottom",
- labelSize = 2,
- ...
-)
-
-# S4 method for AnalysisData
-plotUnsupervisedRF(
- x,
- cls = "class",
- rf = list(),
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- seed = 1234,
- title = "",
- legendPosition = "bottom",
- labelSize = 2
-)
-
-# S4 method for Analysis
-plotUnsupervisedRF(
- x,
- cls = "class",
- rf = list(),
- label = NULL,
- shape = FALSE,
- ellipses = TRUE,
- seed = 1234,
- title = "",
- legendPosition = "bottom",
- labelSize = 2,
- type = "raw"
-)
object of class AnalysisData
or Analysis
sample information column to use for sample labelling
list of additional parameters to pass to randomForest
info column to use for sample labels. Set to NULL for no labels.
TRUE/FALSE use shape aesthetic for plot points. -Defaults to TRUE when the number of classes is greater than 12
TRUE/FALSE, plot multivariate normal distribution 95% -confidence ellipses for each class
random number seed
plot title
legend position to pass to legend.position argument
-of ggplot2::theme
. Set to "none" to remove legend.
label size. Ignored if label
is NULL
arguments to pass to the appropriate method
raw
or pre-treated
data to plot
library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-## Unsupervised random forest MDS plot
-plotUnsupervisedRF(d,cls = 'day')
-
-
Return pre-treatment elements, methods and parameters.
-preTreatmentElements()
-
-preTreatmentMethods(element)
-
-preTreatmentParameters(methods)
pre-treatment element name
a named list of element methods
## Return the available pre-treatment elements
-preTreatmentElements()
-#> [1] "aggregate" "correction" "impute" "keep"
-#> [5] "occupancyFilter" "QC" "remove" "transform"
-
-## Return the available pre-treatment methods for the remove element
-preTreatmentMethods('remove')
-#> [1] "classes" "features" "samples"
-
-## Define some default pre-treatment parameters
-p <- preTreatmentParameters(
- list(
- remove = 'classes',
- QC = c('RSDfilter','removeQC'),
- transform = 'TICnorm'
- )
-)
-
-## Assign the pre-treatment parameters to analysis parameters
-ap <- analysisParameters('pre-treatment')
-parameters(ap,'pre-treatment') <- p
-
-print(ap)
-#> Parameters:
-#> pre-treatment
-#> remove
-#> classes
-#> cls = class
-#> classes = c()
-#> QC
-#> RSDfilter
-#> cls = class
-#> QCidx = QC
-#> RSDthresh = 50
-#> removeQC
-#> cls = class
-#> QCidx = QC
-#> transform
-#> TICnorm
-#>
-
Get or set an AnalysisData object from -the pre-treated slot of the Analysis class.
-preTreated(x) - -preTreated(x) <- value - -# S4 method for Analysis -preTreated(x) - -# S4 method for Analysis -preTreated(x) <- value- -
x | -S4 object of class Analysis |
-
---|---|
value | -S4 object of class AnalysisData |
-
Return names of available pre-treatment elements
-preTreatmentElements()- - - -
Perform random forest on an AnalysisData
object
randomForest(
- x,
- cls = "class",
- rf = list(),
- reps = 1,
- binary = FALSE,
- comparisons = list(),
- perm = 0,
- returnModels = FALSE,
- seed = 1234
-)
-
-# S4 method for AnalysisData
-randomForest(
- x,
- cls = "class",
- rf = list(),
- reps = 1,
- binary = FALSE,
- comparisons = list(),
- perm = 0,
- returnModels = FALSE,
- seed = 1234
-)
S4 object of class AnalysisData
vector of sample information columns to use for response variable information. Set to NULL for unsupervised.
named list of arguments to pass to randomForest::randomForest
number of repetitions to perform
TRUE/FALSE should binary comparisons be performed. Ignored for unsupervised and regression. Ignored if comparisons
specified.
list of comparisons to perform. -Ignored for unsupervised and regression. See details.
number of permutations to perform. Ignored for unsupervised.
TRUE/FALSE should model objects be returned.
random number seed
An S4 object of class RandomForest
.
Specified class comparisons should be given as a list named
-according to cls
. Comparisons should be given as class names
-separated by '~' (eg. '1~2~H').
library(metaboData)
-
-x <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- occupancyMaximum(cls = 'day') %>%
- transformTICnorm()
-
-rf <- randomForest(x,cls = 'day')
-
-plotMDS(rf,cls = 'day')
-
-
Get or set an AnalysisData object from -the raw slot of the Analysis class.
-raw(x) - -raw(x) <- value - -# S4 method for Analysis -raw(x) - -# S4 method for Analysis -raw(x) <- value- -
x | -S4 object of class Analysis |
-
---|---|
value | -S4 object of class AnalysisData |
-
Re-analyse an object of class Analysis using -specified parameters.
-reAnalyse(analysis, parameters = analysisParameters(), verbose = TRUE)- -
analysis | -an object of class Analysis containing previous -analysis results |
-
---|---|
parameters | -an object of class Parameters containing parameters for -re-analysis |
-
verbose | -should output be printed to the console |
-
-library(metaboData) - -## Generate analysis parameters -p <- analysisParameters(c('pre-treatment','modelling')) - -## Alter pre-treatment and modelling parameters to use different methods -parameters(p,'pre-treatment') <- preTreatmentParameters( - list(occupancyFilter = 'maximum', - transform = 'TICnorm') -) -parameters(p,'modelling') <- modellingParameters('anova') - -## Change "cls" and "nCores" parameters -changeParameter(p,'cls') <- 'day' -changeParameter(p,'nCores') <- 2 - -## Run analysis using a subset of the abr1 negative mode data set -analysis <- metabolyse(abr1$neg[,1:200], - abr1$fact, - p) -#>#>#> Parameters: -#> -#> -#> -#> -#> -#> -#> -#> -#> -#> -#> -#> -#> -#>#>#> Pre-treatment …#> Pre-treatment ✓ [0.9S]#> Modelling …#> -#>#>-#> -#>#> Modelling ✓ [0.7S]#>#> -#> Complete! [1.7S]-## Re-analyse to include correlation analysis -analysis <- reAnalyse(analysis, - parameters = analysisParameters('correlations')) -#> -#> metabolyseR v0.14.0 Thu Apr 15 21:57:55 2021 -#> ________________________________________________________________________________ -#> Parameters: -#> correlations -#> method = pearson -#> pAdjustMethod = bonferroni -#> corPvalue = 0.05 -#> ________________________________________________________________________________ -#>#> Correlations …#> Error in (function (cl, name, valueClass) { ClassDef <- getClass(cl) slotClass <- ClassDef@slots[[name]] if (is.null(slotClass)) stop(gettextf("%s is not a slot in class %s", sQuote(name), dQuote(cl)), domain = NA) if (.identC(slotClass, valueClass)) return(TRUE) ok <- possibleExtends(valueClass, slotClass, ClassDef2 = getClassDef(slotClass, where = .classEnv(ClassDef))) if (isFALSE(ok)) stop(gettextf("assignment of an object of class %s is not valid for @%s in an object of class %s; is(value, \"%s\") is not TRUE", dQuote(valueClass), sQuote(name), dQuote(cl), slotClass), domain = NA) TRUE})(structure("AnalysisData", package = "metabolyseR"), "correlations", c("tbl_df", "tbl", "data.frame")): 
‘correlations’ is not a slot in class “AnalysisData”
These objects are imported from other packages. Follow the links -below to see their documentation.
-%>%
Exclusion of samples, classes or features from an AnalysisData
object.
removeClasses(d, cls = "class", classes = c())
-
-# S4 method for AnalysisData
-removeClasses(d, cls = "class", classes = c())
-
-removeFeatures(d, features = character())
-
-# S4 method for AnalysisData
-removeFeatures(d, features = character())
-
-removeSamples(d, idx = "fileOrder", samples = c())
-
-# S4 method for AnalysisData
-removeSamples(d, idx = "fileOrder", samples = c())
S4 object of class AnalysisData
info column to use for class information
classes to remove
features to remove
info column containing sample indexes
sample indexes to remove
An S4 object of class AnalysisData
with samples, classes or features removed.
removeClasses
: Remove classes.
removeFeatures
: Remove features.
removeSamples
: Remove samples.
library(metaboData)
- d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
- ## Remove classes
- d %>%
- removeClasses(cls = 'day',classes = 'H')
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 100
-#> Features: 101
-#> Info: 9
-#>
-
- ## Remove features
- d %>%
- removeFeatures(features = c('N200','N201'))
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 120
-#> Features: 99
-#> Info: 9
-#>
-
- ## Remove samples
- d %>%
- removeSamples(idx = 'injorder',samples = c(1,10))
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 118
-#> Features: 101
-#> Info: 9
-#>
-
Remove classes from an AnalysisData object.
-removeClasses(d, cls = "class", classes = c()) - -# S4 method for AnalysisData -removeClasses(d, cls = "class", classes = c())- -
d | -S4 object of class AnalysisData |
-
---|---|
cls | -info column to use for class information |
-
classes | -classes to remove |
-
Remove samples from an AnalysisData object.
-removeSamples(d, idx = "fileOrder", samples = c()) - -# S4 method for AnalysisData -removeSamples(d, idx = "fileOrder", samples = c())- -
d | -S4 object of class AnalysisData |
-
---|---|
idx | -info column containing sample indexes |
-
samples | -sample indexes to remove |
-
ROC curves for out-of-bag random forest predictions.
-roc(x)
-
-# S4 method for RandomForest
-roc(x)
-
-# S4 method for list
-roc(x)
-
-# S4 method for Analysis
-roc(x)
S4 object of class RandomForest
, Analysis
or a list
A tibble containing the ROC curves.
-library(metaboData)
-
-x <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- occupancyMaximum(cls = 'day') %>%
- transformTICnorm()
-
-rf <- randomForest(x,cls = 'day')
-
-roc(rf)
-#> # A tibble: 711 × 6
-#> Response Comparison Class .threshold specificity sensitivity
-#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
-#> 1 day 1~2~3~4~5~H 1 -Inf 0 1
-#> 2 day 1~2~3~4~5~H 1 0 0 1
-#> 3 day 1~2~3~4~5~H 1 0.00503 0.01 1
-#> 4 day 1~2~3~4~5~H 1 0.00538 0.02 1
-#> 5 day 1~2~3~4~5~H 1 0.0103 0.03 1
-#> 6 day 1~2~3~4~5~H 1 0.0105 0.04 1
-#> 7 day 1~2~3~4~5~H 1 0.0117 0.05 1
-#> 8 day 1~2~3~4~5~H 1 0.0144 0.06 1
-#> 9 day 1~2~3~4~5~H 1 0.0157 0.07 1
-#> 10 day 1~2~3~4~5~H 1 0.0222 0.08 1
-#> # … with 701 more rows
-
Calculate relative standard deviation (RSD) percentage values for each -feature per class for a given sample information column.
-rsd(x, cls = "class")
-
-# S4 method for AnalysisData
-rsd(x, cls = "class")
S4 object of class AnalysisData
sample information column to use for class structure
A tibble containing the computed RSD values.
-library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact)
-
-rsd(d,cls = 'day')
-#> # A tibble: 606 × 5
-#> day Feature Mean SD RSD
-#> <fct> <chr> <dbl> <dbl> <dbl>
-#> 1 1 N200 0.224 1.00 447.
-#> 2 1 N201 0.228 0.946 415.
-#> 3 1 N202 0.0538 0.151 280.
-#> 4 1 N203 1.34 1.03 76.5
-#> 5 1 N204 0.0833 0.202 242.
-#> 6 1 N205 1.55 2.29 148.
-#> 7 1 N206 0.112 0.360 320.
-#> 8 1 N207 0.220 0.396 180.
-#> 9 1 N208 0.124 0.225 182.
-#> 10 1 N209 1.37 2.03 148.
-#> # … with 596 more rows
-
Return sample info from an AnalysisData or Analysis object.
-sinfo(x, ...) - -sinfo(x, ...) <- value - -# S4 method for AnalysisData -sinfo(x) - -# S4 method for AnalysisData -sinfo(x) <- value - -# S4 method for Analysis -sinfo(x, type = "raw", value) - -# S4 method for Analysis -sinfo(x, type = "raw") <- value- -
x | -S4 object of class AnalysisData or Analysis |
-
---|---|
... | -arguments to pass to the appropriate method |
-
value | -tibble containing sample info |
-
type | -sample information type to extract or set. -Should be one of "raw" or "pre-treated" |
-
Split an object of class AnalysisData
into a list based on
-a class grouping variable.
split(x, cls = "class")
-
-# S4 method for AnalysisData
-split(x, cls = "class")
S4 object of class AnalysisData
sample information column to use for splitting
A list of AnalysisData
objects.
library(metaboData)
-
-d <- analysisData(abr1$neg,abr1$fact)
-
-## Split the data set based on the 'day' class information column
-d <- split(d,cls = 'day')
-
-print(d)
-#> $`1`
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 2000
-#> Info: 9
-#>
-#>
-#> $`2`
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 2000
-#> Info: 9
-#>
-#>
-#> $`3`
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 2000
-#> Info: 9
-#>
-#>
-#> $`4`
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 2000
-#> Info: 9
-#>
-#>
-#> $`5`
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 2000
-#> Info: 9
-#>
-#>
-#> $H
-#>
-#> AnalysisData object containing:
-#>
-#> Samples: 20
-#> Features: 2000
-#> Info: 9
-#>
-#>
-
Methods for data scaling, transformation and normalisation.
-transformArcSine(d)
-
-# S4 method for AnalysisData
-transformArcSine(d)
-
-transformAuto(d)
-
-# S4 method for AnalysisData
-transformAuto(d)
-
-transformCenter(d)
-
-# S4 method for AnalysisData
-transformCenter(d)
-
-transformLevel(d)
-
-# S4 method for AnalysisData
-transformLevel(d)
-
-transformLn(d, add = 1)
-
-# S4 method for AnalysisData
-transformLn(d, add = 1)
-
-transformLog10(d, add = 1)
-
-# S4 method for AnalysisData
-transformLog10(d, add = 1)
-
-transformPareto(d)
-
-# S4 method for AnalysisData
-transformPareto(d)
-
-transformRange(d)
-
-# S4 method for AnalysisData
-transformRange(d)
-
-transformSQRT(d)
-
-# S4 method for AnalysisData
-transformSQRT(d)
-
-transformTICnorm(d)
-
-# S4 method for AnalysisData
-transformTICnorm(d)
-
-transformVast(d)
-
-# S4 method for AnalysisData
-transformVast(d)
S4 object of class AnalysisData
value to add prior to transformation
An S4 object of class AnalysisData
containing the transformed data.
Prior to downstream analyses, metabolomics data often require transformation to fulfil the assumptions of a particular statistical/data mining technique. -Before applying a transformation, it is important to consider the effects that the transformation will have on the data, as this can greatly affect the outcome of further downstream analyses. -It is also important to consider at what stage in the pre-treatment routine a transformation is applied as this too could introduce artefacts into the data. -The best practice is to apply a transformation as the last step in a pre-treatment routine after all other steps have been taken. -There is a wide range of transformation methods available that are commonly used for the analysis of metabolomics data.
-transformArcSine
: Arc-sine transformation.
transformAuto
: Auto scaling.
transformCenter
: Mean centring.
transformLevel
: Level scaling.
transformLn
: Natural logarithmic transformation.
transformLog10
: Logarithmic transformation.
transformPareto
: Pareto scaling.
transformRange
: Range scaling. Also known as min-max scaling.
transformSQRT
: Square root transformation.
transformTICnorm
: Total ion count normalisation.
transformVast
: Vast scaling.
-## Each of the following examples shows the application of the transformation and then
-## a Linear Discriminant Analysis is plotted to show its effect on the data structure.
-
-## Initial example data preparation
-library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- occupancyMaximum(occupancy = 2/3)
-
-d %>%
- plotLDA(cls = 'day')
-
-
-
-## Arc-sine transformation
-d %>%
- transformArcSine() %>%
- plotLDA(cls = 'day')
-
-
-## Auto scaling
-d %>%
- transformAuto() %>%
- plotLDA(cls = 'day')
-
-
-## Mean centring
-d %>%
- transformCenter()%>%
- plotLDA(cls = 'day')
-
-
-## Level scaling
-d %>%
- transformLevel() %>%
- plotLDA(cls = 'day')
-
-
-## Natural logarithmic transformation
-d %>%
- transformLn() %>%
- plotLDA(cls = 'day')
-
-
-## Logarithmic transformation
-d %>%
- transformLog10()%>%
- plotLDA(cls = 'day')
-
-
-## Pareto scaling
-d %>%
- transformPareto() %>%
- plotLDA(cls = 'day')
-
-
-## Range scaling
-d %>%
- transformRange() %>%
- plotLDA(cls = 'day')
-
-
-## Square root transformation
-d %>%
- transformSQRT() %>%
- plotLDA(cls = 'day')
-
-
-## Total ion count normalisation
-d %>%
- transformTICnorm() %>%
- plotLDA(cls = 'day')
-
-
-## Vast scaling
-d %>%
- transformVast() %>%
- plotLDA(cls = 'day')
-
-
Welch's t-test
-S4 object of class AnalysisData
vector of sample information column names to analyse
p value adjustment method
named list of binary comparisons to analyse
should models be returned
An S4 object of class Univariate
.
library(metaboData)
-
-d <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
- keepClasses(cls = 'day',classes = c('H','5'))
-
-## Perform t-test
-ttest_analysis <- ttest(d,cls = 'day')
-
-## Extract significant features
-explanatoryFeatures(ttest_analysis)
-#> # A tibble: 11 × 14
-#> Response Comparison Feature estimate estimate1 estimate2 statistic p.value
-#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 day 5~H N277 65.4 79.2 13.8 7.77 1.58e-7
-#> 2 day 5~H N299 7.68 8.99 1.31 6.36 2.53e-6
-#> 3 day 5~H N229 50.3 55.2 4.93 5.96 8.60e-6
-#> 4 day 5~H N295 4.19 5.12 0.937 5.56 8.65e-6
-#> 5 day 5~H N233 -4.65 2.68 7.33 -5.00 1.69e-5
-#> 6 day 5~H N267 27.3 48.1 20.8 4.79 2.96e-5
-#> 7 day 5~H N245 18.0 19.9 1.94 4.92 9.00e-5
-#> 8 day 5~H N279 7.64 9.21 1.57 4.61 1.63e-4
-#> 9 day 5~H N278 4.14 6.27 2.12 4.45 1.76e-4
-#> 10 day 5~H N281 3.02 3.72 0.701 4.47 1.92e-4
-#> 11 day 5~H N272 2.99 3.71 0.722 4.30 2.49e-4
-#> # … with 6 more variables: parameter <dbl>, conf.low <dbl>, conf.high <dbl>,
-#> # method <chr>, alternative <chr>, adjusted.p.value <dbl>
-