Skip to content

Commit

Permalink
Merge pull request #655 from FriederikeHanssen/intervals_fasta
Browse files Browse the repository at this point in the history
Fix --intervals false
  • Loading branch information
maxulysse authored Jul 20, 2022
2 parents d1402dd + 7082782 commit 3720676
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 42 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#642](https://github.com/nf-core/sarek/pull/642) - Only unzip ref files if tool is run, only publish ref files if `--save_reference` and simplify CNKit logic
- [#650](https://github.com/nf-core/sarek/pull/650) - Fix intervals checks
- [#654](https://github.com/nf-core/sarek/pull/654) - Allow any step but annotation to start from BAM files
- [#655](https://github.com/nf-core/sarek/pull/655) - Fix `--intervals false` logic & add versioning for local modules
- [#658](https://github.com/nf-core/sarek/pull/658) - Fix split fastq names in multiqc-report
- [#666](https://github.com/nf-core/sarek/pull/666) - Simplify multiqc config channel input

Expand Down
12 changes: 9 additions & 3 deletions modules/local/build_intervals/main.nf
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
process BUILD_INTERVALS {
tag "$fasta_fai"
tag "$meta.id"

conda (params.enable_conda ? "anaconda::gawk=5.1.0" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
'quay.io/biocontainers/gawk:5.1.0' }"

input:
path fasta_fai
tuple val(meta), path(fasta_fai)

output:
path "*.bed", emit: bed
tuple val(meta), path("${fasta_fai.baseName}.bed") , emit: bed
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
"""
awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fasta_fai} > ${fasta_fai.baseName}.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
}
28 changes: 22 additions & 6 deletions modules/local/create_intervals_bed/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ process CREATE_INTERVALS_BED {
'quay.io/biocontainers/gawk:5.1.0' }"

input:
path intervals
path(intervals)

output:
path ("*.bed"), emit: bed
//TODO version number missing
path("*.bed") , emit: bed
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when
Expand All @@ -20,7 +20,7 @@ process CREATE_INTERVALS_BED {
// If intervals file is in BED format,
// Fifth column is interpreted to contain runtime estimates
// Which is then used to combine short-running jobs
if (intervals.toString().toLowerCase().endsWith("bed"))
if (intervals.toString().toLowerCase().endsWith("bed")) {
"""
awk -vFS="\t" '{
t = \$5 # runtime estimate
Expand All @@ -39,19 +39,35 @@ process CREATE_INTERVALS_BED {
chunk += t
print \$0 > name
}' ${intervals}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
else if (intervals.toString().toLowerCase().endsWith("interval_list"))
} else if (intervals.toString().toLowerCase().endsWith("interval_list")) {
"""
grep -v '^@' ${intervals} | awk -vFS="\t" '{
name = sprintf("%s_%d-%d", \$1, \$2, \$3);
printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
}'
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
else
} else {
"""
awk -vFS="[:-]" '{
name = sprintf("%s_%d-%d", \$1, \$2, \$3);
printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
}' ${intervals}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
}
}
33 changes: 33 additions & 0 deletions subworkflows/local/prepare_cnvkit_reference.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//
// PREPARE CNVKIT REFERENCE
//

// Initialize channels based on params or indices that were just built
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include { CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'

workflow PREPARE_CNVKIT_REFERENCE {
take:
fasta // channel: [mandatory] fasta
intervals_bed_combined // channel: []

main:

ch_versions = Channel.empty()

// prepare antitarget reference files for tumor_only mode of cnvkit
CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] })
CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} )

ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions)
ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions)

emit:
versions = ch_versions
cnvkit_reference = CNVKIT_REFERENCE.out.cnn
}


10 changes: 0 additions & 10 deletions subworkflows/local/prepare_genome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@

include { BWA_INDEX as BWAMEM1_INDEX } from '../../modules/nf-core/modules/bwa/index/main'
include { BWAMEM2_INDEX } from '../../modules/nf-core/modules/bwamem2/index/main'
include {CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include {CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'
include { DRAGMAP_HASHTABLE } from '../../modules/nf-core/modules/dragmap/hashtable/main'
include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/modules/gatk4/createsequencedictionary/main'
include { MSISENSORPRO_SCAN } from '../../modules/nf-core/modules/msisensorpro/scan/main'
Expand All @@ -37,7 +35,6 @@ workflow PREPARE_GENOME {
fasta // channel: [mandatory] fasta
fasta_fai // channel: [optional] fasta_fai
germline_resource // channel: [optional] germline_resource
intervals_bed_combined // channel: []
known_indels // channel: [optional] known_indels
pon // channel: [optional] pon

Expand All @@ -63,10 +60,6 @@ workflow PREPARE_GENOME {
TABIX_KNOWN_INDELS( known_indels.flatten().map{ it -> [[id:it.baseName], it] } )
TABIX_PON(pon.flatten().map{ it -> [[id:it.baseName], it] })

// prepare a reference for tumor_only mode based on target_baits
CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] })
CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} )

// prepare ascat reference files
allele_files = ascat_alleles
if (params.ascat_alleles && params.ascat_alleles.endsWith('.zip')) {
Expand Down Expand Up @@ -106,8 +99,6 @@ workflow PREPARE_GENOME {
ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
ch_versions = ch_versions.mix(BWAMEM1_INDEX.out.versions)
ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions)
ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions)
ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions)
ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions)
ch_versions = ch_versions.mix(MSISENSORPRO_SCAN.out.versions)
ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions)
Expand All @@ -127,7 +118,6 @@ workflow PREPARE_GENOME {
msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list
pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi
chr_files = chr_files
cnvkit_reference = CNVKIT_REFERENCE.out.cnn
allele_files = allele_files
loci_files = loci_files
gc_file = gc_file
Expand Down
32 changes: 20 additions & 12 deletions subworkflows/local/prepare_intervals.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main'
include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main'
include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'
include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main'
include { CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include { CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'
include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main'
include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'

workflow PREPARE_INTERVALS {
take:
Expand Down Expand Up @@ -42,15 +44,21 @@ workflow PREPARE_INTERVALS {
//If no interval/target file is provided, then intervals are generated from FASTA file
if (!params.intervals) {

BUILD_INTERVALS(fasta_fai)
ch_intervals_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] }
BUILD_INTERVALS(fasta_fai.map{it -> [[id:it.baseName], it]})

ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined)
ch_intervals_combined = BUILD_INTERVALS.out.bed

ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined.map{meta, path -> path}).bed

ch_versions = ch_versions.mix(BUILD_INTERVALS.out.versions)
ch_versions = ch_versions.mix(CREATE_INTERVALS_BED.out.versions)

} else {

ch_intervals_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] }
ch_intervals = CREATE_INTERVALS_BED(file(params.intervals))

ch_intervals = CREATE_INTERVALS_BED(file(params.intervals)).bed
ch_versions = ch_versions.mix(CREATE_INTERVALS_BED.out.versions)

//If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format
if(!params.intervals.endsWith(".bed")) {
Expand Down Expand Up @@ -98,8 +106,8 @@ workflow PREPARE_INTERVALS {
}

emit:
intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution]
intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution]
intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed }.collect() // path: intervals.bed [all intervals in one file]
versions = ch_versions // channel: [ versions.yml ]
intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution]
intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution]
intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed }.collect() // path: intervals.bed [all intervals in one file]
versions = ch_versions // channel: [ versions.yml ]
}
28 changes: 28 additions & 0 deletions tests/test_targeted.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,31 @@
- path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi
- path: results/reports/samtools/test/test.md.cram.stats
- path: results/reports/samtools/test/test.recal.cram.stats

- name: Run intervals false pipeline
command: nextflow run main.nf -profile test,docker --intervals false
tags:
- default
- preprocessing
files:
- path: results/csv/markduplicates.csv
- path: results/csv/markduplicates_no_table.csv
- path: results/csv/recalibrated.csv
- path: results/multiqc
- path: results/preprocessing/markduplicates/test/test.md.cram
- path: results/preprocessing/markduplicates/test/test.md.cram.crai
- path: results/preprocessing/recal_table/test/test.recal.table
- path: results/preprocessing/recalibrated/test/test.recal.cram
- path: results/preprocessing/recalibrated/test/test.recal.cram.crai
- path: results/reports/fastqc/test-test_L1
- path: results/reports/markduplicates/test/test.md.metrics
- path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt
- path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt
- path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt
- path: results/reports/mosdepth/test/test.md.regions.bed.gz
- path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt
- path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt
- path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt
- path: results/reports/mosdepth/test/test.recal.regions.bed.gz
- path: results/reports/samtools/test/test.md.cram.stats
- path: results/reports/samtools/test/test.recal.cram.stats
29 changes: 18 additions & 11 deletions workflows/sarek.nf
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ include { PREPARE_GENOME } from '../subwor
// Build intervals if needed
include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals'

// Build CNVkit reference if needed
include { PREPARE_CNVKIT_REFERENCE } from '../subworkflows/local/prepare_cnvkit_reference'

// Convert BAM files to FASTQ files
include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_INPUT } from '../subworkflows/nf-core/alignment_to_fastq'
include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_UMI } from '../subworkflows/nf-core/alignment_to_fastq'
Expand Down Expand Up @@ -296,15 +299,6 @@ workflow SAREK {
// To gather used softwares versions for MultiQC
ch_versions = Channel.empty()

// Build intervals if needed
PREPARE_INTERVALS(fasta_fai)

// Intervals for speed up preprocessing/variant calling by spread/gather
intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file
intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)

intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather

// Build indices if needed
PREPARE_GENOME(
Expand All @@ -317,7 +311,6 @@ workflow SAREK {
fasta,
fasta_fai,
germline_resource,
intervals_bed_combined,
known_indels,
pon)

Expand All @@ -326,7 +319,6 @@ workflow SAREK {
bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : PREPARE_GENOME.out.bwa : []
bwamem2 = params.fasta ? params.bwamem2 ? Channel.fromPath(params.bwamem2).collect() : PREPARE_GENOME.out.bwamem2 : []
chr_files = PREPARE_GENOME.out.chr_files
cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? PREPARE_GENOME.out.cnvkit_reference : Channel.empty()
dragmap = params.fasta ? params.dragmap ? Channel.fromPath(params.dragmap).collect() : PREPARE_GENOME.out.hashtable : []
dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : PREPARE_GENOME.out.dict : []
fasta_fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : PREPARE_GENOME.out.fasta_fai : []
Expand All @@ -349,10 +341,25 @@ workflow SAREK {
known_sites = dbsnp.concat(known_indels).collect()
known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect()

// Build intervals if needed
PREPARE_INTERVALS(fasta_fai)

// Intervals for speed up preprocessing/variant calling by spread/gather
intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file
intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)

intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather

// Gather used softwares versions
ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)
ch_versions = ch_versions.mix(PREPARE_INTERVALS.out.versions)

// Antitarget based reference for CNVKit
PREPARE_CNVKIT_REFERENCE(fasta, intervals_bed_combined)
cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? PREPARE_CNVKIT_REFERENCE.out.cnvkit_reference : Channel.empty()
ch_versions = ch_versions.mix(PREPARE_CNVKIT_REFERENCE.out.versions)

// PREPROCESSING

if (params.step == 'mapping') {
Expand Down

0 comments on commit 3720676

Please sign in to comment.