Merge pull request #180 from uclahs-cds/nzeltser-update-spark-tempdir

alkaZeltser · web-flow · commit 29f1440a8d6a · 2022-03-21T14:24:06.000-07:00
Set default spark tempdir param and add checks
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,9 +13,11 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 - Standardize output and log directory structure
 - Update index file extension from all processes to .bam.bai 
 - Standardize config files
+- Remove spark_temp_dir parameter from config template
 
 ### Added
 - Intermediate file removal
+- Spark tempdir permission checks
 
 ## [7.3.1] - 2022-01-14
 ### Changed
diff --git a/README.md b/README.md
@@ -165,7 +165,8 @@ After marking dup BAM files, the BAM files are then indexed by utilizing Picard
 | `cache_intermediate_pipeline_steps` | yes | boolean | Enable caching to resume the pipeline at the end of the last successful process completion when a pipeline fails (if true the default submission script must be modified). |
 | `mark_duplicates` | no | boolean | Disable processes which mark duplicates. When false, the pipeline stops at the sorting step, outputting a sorted, indexed, unmerged BAM with unmarked duplicates. Recommended for high coverage targeted panel sequencing datasets. Defaults as true to mark duplicates as usual.|
 | `enable_spark` | yes | boolean | Enable use of Spark processes. When true, `MarkDuplicatesSpark` will be used. When false, `MarkDuplicates` will be used. Default value is true. |
-| `spark_temp_dir` | yes | path | Path to temp dir for Spark processes. Defaults to `/scratch`. |
+| `spark_temp_dir` | no | path | Path to temp dir for Spark processes. When included in the sample config file, Spark intermediate files will be saved to this directory. Defaults to `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively.|
+| `work_dir` | no | path | Path of working directory for Nextflow. When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With ucla_cds, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. |
 | `max_number_of_parallel_jobs` | no | int | The maximum number of jobs or steps of the pipeline that can be run in parallel. Default is 1. Be very cautious setting this to any value larger than 1, as it may cause an out-of-memory error. It may be helpful when running on a big memory computing node. |
 | `bwa_mem_number_of_cpus` | no | int | Number of cores to use for BWA-MEM2. If not set, this will be calculated to ensure at least 2.5Gb memory per core. |
 | `blcds_registered_dataset_input` | yes | boolean | Input FASTQs are from the Boutros Lab data registry. |
diff --git a/pipeline/config/methods.config b/pipeline/config/methods.config
@@ -217,21 +217,30 @@ methods {
         if (params.ucla_cds) {
             /**
              * By default, if the /scratch directory exists, set it as the Nextflow working directory
+             * and Spark temp directory.
              * If config file specified work_dir, set it as the Nextflow working directory
+             * If config file specified spark_temp_dir, set it as the Spark temp directory
              * 
-             * WARNING: changing this directory can lead to high server latency and
-             * potential disk space limitations. Change with caution! The 'workDir'
-             * in Nextflow determines the location of intermediate and temporary files.
+             * WARNING: changing these directories can lead to high server latency and
+             * potential disk space limitations. Change with caution! Handles creation of
+             * directories which don't already exist e.g. '/scratch/test/'
+             * The 'workDir' in Nextflow determines the location of intermediate and temporary files.
              */
             params.work_dir = (params.containsKey('work_dir') && params.work_dir) ? params.work_dir : '/scratch'
             if (methods.check_workdir_permissions(params.work_dir)) {
                 workDir = params.work_dir
             }
+            
+            params.spark_temp_dir = (params.containsKey('spark_temp_dir') && params.spark_temp_dir && methods.check_workdir_permissions(params.spark_temp_dir)) ? params.spark_temp_dir : '/scratch'
+
         } else {
             // If work_dir was specified as a param and exists or can be created, set workDir. Otherwise, let Nextflow's default behavior dictate workDir
             if (params.containsKey('work_dir') && params.work_dir && methods.check_workdir_permissions(params.work_dir)) {
                 workDir = params.work_dir
             }
+
+            // If spark_temp_dir was specified as a param and exists or can be created, set as spark tempdir. Otherwise, set as workDir.
+            params.spark_temp_dir = (params.containsKey('spark_temp_dir') && params.spark_temp_dir && methods.check_workdir_permissions(params.spark_temp_dir)) ? params.spark_temp_dir : workDir
         }
     }
 
diff --git a/pipeline/config/template.config b/pipeline/config/template.config
@@ -36,8 +36,6 @@ params {
     // Spark options
     // By default, the Spark process MarkDuplicatesSpark will be used. Set to false to disable Spark process and use MarkDuplicates (Picard) instead
     enable_spark = true
-    // Default Spark temp dir is /scratch. Update if necessary
-    spark_temp_dir = "/scratch"
 
     // set to true if the data input fastq files are registered in the Boutros Lab.
     blcds_registered_dataset_input = false