From 56496c0b19a1ef82eb4fe5b66d417293aa15266e Mon Sep 17 00:00:00 2001 From: drpatelh Date: Mon, 30 Nov 2020 11:26:02 +0000 Subject: [PATCH 01/11] Add STAR parameter when building small indices --- conf/test.config | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index ef8b5718b..35814a298 100644 --- a/conf/test.config +++ b/conf/test.config @@ -8,7 +8,7 @@ */ params { - config_profile_name = 'Test profile' + config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run CI @@ -34,4 +34,13 @@ params { // Other parameters pseudo_aligner = 'salmon' umitools_bc_pattern = 'NNNN' + + // When using RSEM, remove warning from STAR whilst building tiny indices + modules { + 'rsem_preparereference' { + args2 = "--genomeSAindexNbases 7" + } + } } + + From 38640e26e35daece0676fbe8694f83cddedde318 Mon Sep 17 00:00:00 2001 From: drpatelh Date: Mon, 30 Nov 2020 11:26:14 +0000 Subject: [PATCH 02/11] Adjust spacing --- conf/test_full.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/test_full.config b/conf/test_full.config index 7db163b5a..60fd245d9 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -8,14 +8,14 @@ */ params { - config_profile_name = 'Full test profile' + config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/samplesheet_full.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/samplesheet_full.csv' - // Genome references - genome = 'GRCh37' + // Genome reference + genome = 'GRCh37' // Other parameters pseudo_aligner = 'salmon' From d2cf07659841d541bb48c8aaad617795cec2b25b Mon Sep 17 00:00:00 2001 From: drpatelh Date: Mon, 30 Nov 2020 11:26:20 +0000 Subject: [PATCH 03/11] Adjust spacing --- conf/test_sra.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_sra.config b/conf/test_sra.config index ad9c7fe6b..71ff755af 100644 --- a/conf/test_sra.config +++ b/conf/test_sra.config @@ -8,7 +8,7 @@ */ params { - config_profile_name = 'Public data download test profile' + config_profile_name = 'Public data download test profile' config_profile_description = 'Minimal test dataset to check pipeline function when downloading data via the ENA' // Limit resources so that this can run CI From ad87ea7479f737765cbeddfcf568d2dfb4777116 Mon Sep 17 00:00:00 2001 From: drpatelh Date: Mon, 30 Nov 2020 11:26:46 +0000 Subject: [PATCH 04/11] Use STAR to build initial index for RSEM --- .../software/rsem/preparereference/main.nf | 48 ++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/modules/nf-core/software/rsem/preparereference/main.nf b/modules/nf-core/software/rsem/preparereference/main.nf index 570eac76b..271011438 100644 --- a/modules/nf-core/software/rsem/preparereference/main.nf +++ b/modules/nf-core/software/rsem/preparereference/main.nf @@ -24,15 +24,41 @@ process RSEM_PREPAREREFERENCE { script: def software = getSoftwareName(task.process) - """ - mkdir rsem - rsem-prepare-reference \\ - --gtf $gtf \\ - --num-threads $task.cpus \\ - $options.args \\ - $fasta \\ - rsem/genome + def args = options.args.tokenize() + if (args.contains('--star')) { + args.removeIf { it.contains('--star') } + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + """ + mkdir rsem + STAR \\ + --runMode genomeGenerate \\ + --genomeDir rsem/ \\ + --genomeFastaFiles $fasta \\ + --sjdbGTFfile $gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $options.args2 + + rsem-prepare-reference \\ + --gtf $gtf \\ + --num-threads $task.cpus \\ + ${args.join(' ')} \\ + $fasta \\ + rsem/genome - rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt - """ -} + rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt + """ + } else { + """ + mkdir rsem + rsem-prepare-reference \\ + --gtf $gtf \\ + --num-threads $task.cpus \\ + $options.args \\ + $fasta \\ + rsem/genome + + rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt + """ + } +} \ No newline at end of file From 0e483db9a84669d5f9689f03ae58acd6238cf856 Mon Sep 17 00:00:00 2001 From: drpatelh Date: Mon, 30 Nov 2020 11:40:32 +0000 Subject: [PATCH 05/11] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c69d7bb2..ecf9494a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Updated pipeline template to nf-core/tools `1.12` * [[#500](https://github.com/nf-core/rnaseq/issues/500), [#509](https://github.com/nf-core/rnaseq/issues/509)] - Error with AWS batch params +* [[#511](https://github.com/nf-core/rnaseq/issues/511)] - rsem/star index fails with large genome ## [[2.0](https://github.com/nf-core/rnaseq/releases/tag/2.0)] - 2020-11-12 From e2afdfac2300a75ca79319bf66204b1ff0e7a8c0 Mon Sep 17 00:00:00 2001 From: drpatelh Date: Mon, 30 Nov 2020 14:48:46 +0000 Subject: [PATCH 06/11] Install via nf-core modules --- modules/nf-core/software/rsem/preparereference/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/software/rsem/preparereference/main.nf b/modules/nf-core/software/rsem/preparereference/main.nf index 271011438..982f6cdda 100644 --- a/modules/nf-core/software/rsem/preparereference/main.nf +++ b/modules/nf-core/software/rsem/preparereference/main.nf @@ -61,4 +61,4 @@ process RSEM_PREPAREREFERENCE { rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt """ } -} \ No newline at end of file +} From 91a0d1065ec93de44058d078f37658247f14e84d Mon Sep 17 00:00:00 2001 From: drpatelh Date: Mon, 30 Nov 2020 14:51:08 +0000 Subject: [PATCH 07/11] fastq not bam --- rnaseq.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rnaseq.nf b/rnaseq.nf index b0695e667..6121702e3 100755 --- a/rnaseq.nf +++ b/rnaseq.nf @@ -299,9 +299,9 @@ workflow RNASEQ { ch_input ) .map { - meta, bam -> + meta, fastq -> meta.id = meta.id.split('_')[0..-2].join('_') - [ meta, bam ] } + [ meta, fastq ] } .groupTuple(by: [0]) .map { it -> [ it[0], it[1].flatten() ] } .set { ch_cat_fastq } From edc5df7d4fa096b7304648af8a00b2cd6ad5df24 Mon Sep 17 00:00:00 2001 From: drpatelh Date: Tue, 1 Dec 2020 11:12:22 +0000 Subject: [PATCH 08/11] Allow for dots in sample names @ChristopherBarrington --- bin/deseq2_qc.r | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/deseq2_qc.r b/bin/deseq2_qc.r index 556029fb0..e543110da 100755 --- a/bin/deseq2_qc.r +++ b/bin/deseq2_qc.r @@ -54,11 +54,11 @@ if (is.null(opt$count_file)){ ################################################ ################################################ -count.table <- read.delim(file=opt$count_file,header=TRUE) -rownames(count.table) <- count.table[,opt$id_col] -count.table <- count.table[,opt$count_col:ncol(count.table),drop=FALSE] -colnames(count.table) <- gsub(opt$sample_suffix,"",colnames(count.table)) -colnames(count.table) <- as.character(lapply(colnames(count.table), function (x) tail(strsplit(x,'.',fixed=TRUE)[[1]],1))) +count.table <- read.delim(file=opt$count_file,header=TRUE) +rownames(count.table) <- count.table[,opt$id_col] +count.table <- count.table[,opt$count_col:ncol(count.table),drop=FALSE] +colnames(count.table) <- gsub(opt$sample_suffix,"",colnames(count.table)) +colnames(count.table) <- gsub(pattern='\\.$', replacement='', colnames(count.table)) ################################################ ################################################ From 01d0fc6b6df44a25bdac4d61dcb7bfc471f68ef2 Mon Sep 17 00:00:00 2001 From: drpatelh Date: Tue, 1 Dec 2020 11:12:38 +0000 Subject: [PATCH 09/11] Remove specifying --outdir twice --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 1186daf97..647dc5e43 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -146,7 +146,7 @@ params { publish_dir = "${params.aligner}/bigwig" } 'deseq2_qc' { - args = "--id_col 1 --sample_suffix '' --outdir ./ --outprefix deseq2" + args = "--id_col 1 --sample_suffix '' --outprefix deseq2" publish_files = ['RData':'', 'pca.vals.txt':'', 'plots.pdf':'', 'sample.dists.txt':'', 'size_factors':'', 'log':''] publish_dir = "${params.aligner}/deseq2_qc" } From 76bea0cc772fdf618973b6c7ca156f31beb4ca86 Mon Sep 17 00:00:00 2001 From: drpatelh Date: Tue, 1 Dec 2020 15:45:24 +0000 Subject: [PATCH 10/11] Add a note about running separate arms of pipeline --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4e2f5270c..c4f77f0b5 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,8 @@ On release, automated continuous integration tests run the pipeline on a [full-s -profile ``` +> **NB:** The commands to obtain public data and to run the main arm of the pipeline are completely independent. This is intentional because it allows you to download all of the raw data in an initial pipeline run (`results/public_data/`) and then to curate the auto-created samplesheet based on the available sample metadata before you run the pipeline again properly. + See [usage docs](https://nf-co.re/rnaseq/usage) for all of the available options when running the pipeline. ## Documentation From 641dfbc51dd669cfd28eb50e07686623d8975710 Mon Sep 17 00:00:00 2001 From: drpatelh Date: Tue, 1 Dec 2020 15:46:27 +0000 Subject: [PATCH 11/11] Indent section --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c4f77f0b5..2d068e333 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ On release, automated continuous integration tests run the pipeline on a [full-s -profile ``` -> **NB:** The commands to obtain public data and to run the main arm of the pipeline are completely independent. This is intentional because it allows you to download all of the raw data in an initial pipeline run (`results/public_data/`) and then to curate the auto-created samplesheet based on the available sample metadata before you run the pipeline again properly. + > **NB:** The commands to obtain public data and to run the main arm of the pipeline are completely independent. This is intentional because it allows you to download all of the raw data in an initial pipeline run (`results/public_data/`) and then to curate the auto-created samplesheet based on the available sample metadata before you run the pipeline again properly. See [usage docs](https://nf-co.re/rnaseq/usage) for all of the available options when running the pipeline.