Skip to content

Commit

Permalink
Merge pull request #655 from FriederikeHanssen/intervals_fasta
Browse files Browse the repository at this point in the history
Fix --intervals false
  • Loading branch information
maxulysse authored Jul 20, 2022
2 parents d1402dd + 7082782 commit 3720676
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 42 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#642](https://github.com/nf-core/sarek/pull/642) - Only unzip ref files if tool is run, only publish ref files if `--save_reference` and simplify CNKit logic
- [#650](https://github.com/nf-core/sarek/pull/650) - Fix intervals checks
- [#654](https://github.com/nf-core/sarek/pull/654) - Allow any step but annotation to start from BAM files
- [#655](https://github.com/nf-core/sarek/pull/655) - Fix `--intervals false` logic & add versioning for local modules
- [#658](https://github.com/nf-core/sarek/pull/658) - Fix split fastq names in multiqc-report
- [#666](https://github.com/nf-core/sarek/pull/666) - Simplify multiqc config channel input

Expand Down
12 changes: 9 additions & 3 deletions modules/local/build_intervals/main.nf
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
process BUILD_INTERVALS {
tag "$fasta_fai"
tag "$meta.id"

conda (params.enable_conda ? "anaconda::gawk=5.1.0" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
'quay.io/biocontainers/gawk:5.1.0' }"

input:
path fasta_fai
tuple val(meta), path(fasta_fai)

output:
path "*.bed", emit: bed
tuple val(meta), path("${fasta_fai.baseName}.bed") , emit: bed
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
"""
awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fasta_fai} > ${fasta_fai.baseName}.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
}
28 changes: 22 additions & 6 deletions modules/local/create_intervals_bed/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ process CREATE_INTERVALS_BED {
'quay.io/biocontainers/gawk:5.1.0' }"

input:
path intervals
path(intervals)

output:
path ("*.bed"), emit: bed
//TODO version number missing
path("*.bed") , emit: bed
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when
Expand All @@ -20,7 +20,7 @@ process CREATE_INTERVALS_BED {
// If intervals file is in BED format,
// Fifth column is interpreted to contain runtime estimates
// Which is then used to combine short-running jobs
if (intervals.toString().toLowerCase().endsWith("bed"))
if (intervals.toString().toLowerCase().endsWith("bed")) {
"""
awk -vFS="\t" '{
t = \$5 # runtime estimate
Expand All @@ -39,19 +39,35 @@ process CREATE_INTERVALS_BED {
chunk += t
print \$0 > name
}' ${intervals}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
else if (intervals.toString().toLowerCase().endsWith("interval_list"))
} else if (intervals.toString().toLowerCase().endsWith("interval_list")) {
"""
grep -v '^@' ${intervals} | awk -vFS="\t" '{
name = sprintf("%s_%d-%d", \$1, \$2, \$3);
printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
}'
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
else
} else {
"""
awk -vFS="[:-]" '{
name = sprintf("%s_%d-%d", \$1, \$2, \$3);
printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
}' ${intervals}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
}
}
33 changes: 33 additions & 0 deletions subworkflows/local/prepare_cnvkit_reference.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//
// PREPARE CNVKIT REFERENCE
//

// Initialize channels based on params or indices that were just built
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include { CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'

workflow PREPARE_CNVKIT_REFERENCE {
take:
fasta // channel: [mandatory] fasta
intervals_bed_combined // channel: []

main:

ch_versions = Channel.empty()

// prepare antitarget reference files for tumor_only mode of cnvkit
CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] })
CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} )

ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions)
ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions)

emit:
versions = ch_versions
cnvkit_reference = CNVKIT_REFERENCE.out.cnn
}


10 changes: 0 additions & 10 deletions subworkflows/local/prepare_genome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@

include { BWA_INDEX as BWAMEM1_INDEX } from '../../modules/nf-core/modules/bwa/index/main'
include { BWAMEM2_INDEX } from '../../modules/nf-core/modules/bwamem2/index/main'
include {CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include {CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'
include { DRAGMAP_HASHTABLE } from '../../modules/nf-core/modules/dragmap/hashtable/main'
include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/modules/gatk4/createsequencedictionary/main'
include { MSISENSORPRO_SCAN } from '../../modules/nf-core/modules/msisensorpro/scan/main'
Expand All @@ -37,7 +35,6 @@ workflow PREPARE_GENOME {
fasta // channel: [mandatory] fasta
fasta_fai // channel: [optional] fasta_fai
germline_resource // channel: [optional] germline_resource
intervals_bed_combined // channel: []
known_indels // channel: [optional] known_indels
pon // channel: [optional] pon

Expand All @@ -63,10 +60,6 @@ workflow PREPARE_GENOME {
TABIX_KNOWN_INDELS( known_indels.flatten().map{ it -> [[id:it.baseName], it] } )
TABIX_PON(pon.flatten().map{ it -> [[id:it.baseName], it] })

// prepare a reference for tumor_only mode based on target_baits
CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] })
CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} )

// prepare ascat reference files
allele_files = ascat_alleles
if (params.ascat_alleles && params.ascat_alleles.endsWith('.zip')) {
Expand Down Expand Up @@ -106,8 +99,6 @@ workflow PREPARE_GENOME {
ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
ch_versions = ch_versions.mix(BWAMEM1_INDEX.out.versions)
ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions)
ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions)
ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions)
ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions)
ch_versions = ch_versions.mix(MSISENSORPRO_SCAN.out.versions)
ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions)
Expand All @@ -127,7 +118,6 @@ workflow PREPARE_GENOME {
msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list
pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi
chr_files = chr_files
cnvkit_reference = CNVKIT_REFERENCE.out.cnn
allele_files = allele_files
loci_files = loci_files
gc_file = gc_file
Expand Down
32 changes: 20 additions & 12 deletions subworkflows/local/prepare_intervals.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main'
include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main'
include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'
include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main'
include { CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include { CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'
include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main'
include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'

workflow PREPARE_INTERVALS {
take:
Expand Down Expand Up @@ -42,15 +44,21 @@ workflow PREPARE_INTERVALS {
//If no interval/target file is provided, then intervals are generated from FASTA file
if (!params.intervals) {

BUILD_INTERVALS(fasta_fai)
ch_intervals_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] }
BUILD_INTERVALS(fasta_fai.map{it -> [[id:it.baseName], it]})

ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined)
ch_intervals_combined = BUILD_INTERVALS.out.bed

ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined.map{meta, path -> path}).bed

ch_versions = ch_versions.mix(BUILD_INTERVALS.out.versions)
ch_versions = ch_versions.mix(CREATE_INTERVALS_BED.out.versions)

} else {

ch_intervals_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] }
ch_intervals = CREATE_INTERVALS_BED(file(params.intervals))

ch_intervals = CREATE_INTERVALS_BED(file(params.intervals)).bed
ch_versions = ch_versions.mix(CREATE_INTERVALS_BED.out.versions)

//If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format
if(!params.intervals.endsWith(".bed")) {
Expand Down Expand Up @@ -98,8 +106,8 @@ workflow PREPARE_INTERVALS {
}

emit:
intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution]
intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution]
intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed }.collect() // path: intervals.bed [all intervals in one file]
versions = ch_versions // channel: [ versions.yml ]
intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution]
intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution]
intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed }.collect() // path: intervals.bed [all intervals in one file]
versions = ch_versions // channel: [ versions.yml ]
}
28 changes: 28 additions & 0 deletions tests/test_targeted.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,31 @@
- path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi
- path: results/reports/samtools/test/test.md.cram.stats
- path: results/reports/samtools/test/test.recal.cram.stats

- name: Run intervals false pipeline
command: nextflow run main.nf -profile test,docker --intervals false
tags:
- default
- preprocessing
files:
- path: results/csv/markduplicates.csv
- path: results/csv/markduplicates_no_table.csv
- path: results/csv/recalibrated.csv
- path: results/multiqc
- path: results/preprocessing/markduplicates/test/test.md.cram
- path: results/preprocessing/markduplicates/test/test.md.cram.crai
- path: results/preprocessing/recal_table/test/test.recal.table
- path: results/preprocessing/recalibrated/test/test.recal.cram
- path: results/preprocessing/recalibrated/test/test.recal.cram.crai
- path: results/reports/fastqc/test-test_L1
- path: results/reports/markduplicates/test/test.md.metrics
- path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt
- path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt
- path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt
- path: results/reports/mosdepth/test/test.md.regions.bed.gz
- path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt
- path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt
- path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt
- path: results/reports/mosdepth/test/test.recal.regions.bed.gz
- path: results/reports/samtools/test/test.md.cram.stats
- path: results/reports/samtools/test/test.recal.cram.stats
29 changes: 18 additions & 11 deletions workflows/sarek.nf
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ include { PREPARE_GENOME } from '../subwor
// Build intervals if needed
include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals'

// Build CNVkit reference if needed
include { PREPARE_CNVKIT_REFERENCE } from '../subworkflows/local/prepare_cnvkit_reference'

// Convert BAM files to FASTQ files
include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_INPUT } from '../subworkflows/nf-core/alignment_to_fastq'
include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_UMI } from '../subworkflows/nf-core/alignment_to_fastq'
Expand Down Expand Up @@ -296,15 +299,6 @@ workflow SAREK {
// To gather used softwares versions for MultiQC
ch_versions = Channel.empty()

// Build intervals if needed
PREPARE_INTERVALS(fasta_fai)

// Intervals for speed up preprocessing/variant calling by spread/gather
intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file
intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)

intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather

// Build indices if needed
PREPARE_GENOME(
Expand All @@ -317,7 +311,6 @@ workflow SAREK {
fasta,
fasta_fai,
germline_resource,
intervals_bed_combined,
known_indels,
pon)

Expand All @@ -326,7 +319,6 @@ workflow SAREK {
bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : PREPARE_GENOME.out.bwa : []
bwamem2 = params.fasta ? params.bwamem2 ? Channel.fromPath(params.bwamem2).collect() : PREPARE_GENOME.out.bwamem2 : []
chr_files = PREPARE_GENOME.out.chr_files
cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? PREPARE_GENOME.out.cnvkit_reference : Channel.empty()
dragmap = params.fasta ? params.dragmap ? Channel.fromPath(params.dragmap).collect() : PREPARE_GENOME.out.hashtable : []
dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : PREPARE_GENOME.out.dict : []
fasta_fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : PREPARE_GENOME.out.fasta_fai : []
Expand All @@ -349,10 +341,25 @@ workflow SAREK {
known_sites = dbsnp.concat(known_indels).collect()
known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect()

// Build intervals if needed
PREPARE_INTERVALS(fasta_fai)

// Intervals for speed up preprocessing/variant calling by spread/gather
intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file
intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)

intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather

// Gather used softwares versions
ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)
ch_versions = ch_versions.mix(PREPARE_INTERVALS.out.versions)

// Antitarget based reference for CNVKit
PREPARE_CNVKIT_REFERENCE(fasta, intervals_bed_combined)
cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? PREPARE_CNVKIT_REFERENCE.out.cnvkit_reference : Channel.empty()
ch_versions = ch_versions.mix(PREPARE_CNVKIT_REFERENCE.out.versions)

// PREPROCESSING

if (params.step == 'mapping') {
Expand Down

0 comments on commit 3720676

Please sign in to comment.