diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c69d7bb2..ecf9494a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Updated pipeline template to nf-core/tools `1.12` * [[#500](https://github.com/nf-core/rnaseq/issues/500), [#509](https://github.com/nf-core/rnaseq/issues/509)] - Error with AWS batch params +* [[#511](https://github.com/nf-core/rnaseq/issues/511)] - rsem/star index fails with large genome ## [[2.0](https://github.com/nf-core/rnaseq/releases/tag/2.0)] - 2020-11-12 diff --git a/README.md b/README.md index 4e2f5270c..2d068e333 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,8 @@ On release, automated continuous integration tests run the pipeline on a [full-s -profile ``` + > **NB:** The commands to obtain public data and to run the main arm of the pipeline are completely independent. This is intentional because it allows you to download all of the raw data in an initial pipeline run (`results/public_data/`) and then to curate the auto-created samplesheet based on the available sample metadata before you run the pipeline again properly. + See [usage docs](https://nf-co.re/rnaseq/usage) for all of the available options when running the pipeline. 
## Documentation diff --git a/bin/deseq2_qc.r b/bin/deseq2_qc.r index 556029fb0..e543110da 100755 --- a/bin/deseq2_qc.r +++ b/bin/deseq2_qc.r @@ -54,11 +54,11 @@ if (is.null(opt$count_file)){ ################################################ ################################################ -count.table <- read.delim(file=opt$count_file,header=TRUE) -rownames(count.table) <- count.table[,opt$id_col] -count.table <- count.table[,opt$count_col:ncol(count.table),drop=FALSE] -colnames(count.table) <- gsub(opt$sample_suffix,"",colnames(count.table)) -colnames(count.table) <- as.character(lapply(colnames(count.table), function (x) tail(strsplit(x,'.',fixed=TRUE)[[1]],1))) +count.table <- read.delim(file=opt$count_file,header=TRUE) +rownames(count.table) <- count.table[,opt$id_col] +count.table <- count.table[,opt$count_col:ncol(count.table),drop=FALSE] +colnames(count.table) <- gsub(opt$sample_suffix,"",colnames(count.table)) +colnames(count.table) <- gsub(pattern='\\.$', replacement='', colnames(count.table)) ################################################ ################################################ diff --git a/conf/modules.config b/conf/modules.config index 1186daf97..647dc5e43 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -146,7 +146,7 @@ params { publish_dir = "${params.aligner}/bigwig" } 'deseq2_qc' { - args = "--id_col 1 --sample_suffix '' --outdir ./ --outprefix deseq2" + args = "--id_col 1 --sample_suffix '' --outprefix deseq2" publish_files = ['RData':'', 'pca.vals.txt':'', 'plots.pdf':'', 'sample.dists.txt':'', 'size_factors':'', 'log':''] publish_dir = "${params.aligner}/deseq2_qc" } diff --git a/conf/test.config b/conf/test.config index ef8b5718b..35814a298 100644 --- a/conf/test.config +++ b/conf/test.config @@ -8,7 +8,7 @@ */ params { - config_profile_name = 'Test profile' + config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run CI @@ 
-34,4 +34,13 @@ params { // Other parameters pseudo_aligner = 'salmon' umitools_bc_pattern = 'NNNN' + + // When using RSEM, remove warning from STAR whilst building tiny indices + modules { + 'rsem_preparereference' { + args2 = "--genomeSAindexNbases 7" + } + } } + + diff --git a/conf/test_full.config b/conf/test_full.config index 7db163b5a..60fd245d9 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -8,14 +8,14 @@ */ params { - config_profile_name = 'Full test profile' + config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/samplesheet_full.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/samplesheet_full.csv' - // Genome references - genome = 'GRCh37' + // Genome reference + genome = 'GRCh37' // Other parameters pseudo_aligner = 'salmon' diff --git a/conf/test_sra.config b/conf/test_sra.config index ad9c7fe6b..71ff755af 100644 --- a/conf/test_sra.config +++ b/conf/test_sra.config @@ -8,7 +8,7 @@ */ params { - config_profile_name = 'Public data download test profile' + config_profile_name = 'Public data download test profile' config_profile_description = 'Minimal test dataset to check pipeline function when downloading data via the ENA' // Limit resources so that this can run CI diff --git a/modules/nf-core/software/rsem/preparereference/main.nf b/modules/nf-core/software/rsem/preparereference/main.nf index 570eac76b..982f6cdda 100644 --- a/modules/nf-core/software/rsem/preparereference/main.nf +++ b/modules/nf-core/software/rsem/preparereference/main.nf @@ -24,15 +24,41 @@ process RSEM_PREPAREREFERENCE { script: def software = getSoftwareName(task.process) - """ - mkdir rsem - rsem-prepare-reference \\ - --gtf $gtf \\ - --num-threads $task.cpus \\ - $options.args \\ - $fasta \\ - rsem/genome + def args = 
options.args.tokenize() + if (args.contains('--star')) { + args.removeIf { it == '--star' } // drop only the exact flag so values of other --star-* options are not orphaned + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + """ + mkdir rsem + STAR \\ + --runMode genomeGenerate \\ + --genomeDir rsem/ \\ + --genomeFastaFiles $fasta \\ + --sjdbGTFfile $gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $options.args2 + + rsem-prepare-reference \\ + --gtf $gtf \\ + --num-threads $task.cpus \\ + ${args.join(' ')} \\ + $fasta \\ + rsem/genome - rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt - """ + rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt + """ + } else { + """ + mkdir rsem + rsem-prepare-reference \\ + --gtf $gtf \\ + --num-threads $task.cpus \\ + $options.args \\ + $fasta \\ + rsem/genome + + rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt + """ + } } diff --git a/rnaseq.nf b/rnaseq.nf index b0695e667..6121702e3 100755 --- a/rnaseq.nf +++ b/rnaseq.nf @@ -299,9 +299,9 @@ workflow RNASEQ { ch_input ) .map { - meta, bam -> + meta, fastq -> meta.id = meta.id.split('_')[0..-2].join('_') - [ meta, bam ] } + [ meta, fastq ] } .groupTuple(by: [0]) .map { it -> [ it[0], it[1].flatten() ] } .set { ch_cat_fastq }