Merge pull request #512 from drpatelh/dsl2

Use STAR to build initial index for RSEM - Fixes #511
nf-core · Dec 1, 2020 · 5288244 · 5288244
2 parents 9b54e51 + 641dfbc
commit 5288244
Show file tree

Hide file tree

Showing 9 changed files with 62 additions and 24 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 * Updated pipeline template to nf-core/tools `1.12`
 * [[#500](https://github.com/nf-core/rnaseq/issues/500), [#509](https://github.com/nf-core/rnaseq/issues/509)] - Error with AWS batch params
+* [[#511](https://github.com/nf-core/rnaseq/issues/511)] - RSEM/STAR index build fails with large genomes
 
 ## [[2.0](https://github.com/nf-core/rnaseq/releases/tag/2.0)] - 2020-11-12
 

diff --git a/README.md b/README.md
@@ -82,6 +82,8 @@ On release, automated continuous integration tests run the pipeline on a [full-s
             -profile <docker/singularity/podman/conda/institute>
         ```
 
+    > **NB:** The commands to obtain public data and to run the main arm of the pipeline are completely independent. This is intentional: it allows you to download all of the raw data in an initial pipeline run (`results/public_data/`), curate the auto-created samplesheet based on the available sample metadata, and only then run the main arm of the pipeline with the curated samplesheet.
+
 See [usage docs](https://nf-co.re/rnaseq/usage) for all of the available options when running the pipeline.
 
 ## Documentation

diff --git a/bin/deseq2_qc.r b/bin/deseq2_qc.r
@@ -54,11 +54,11 @@ if (is.null(opt$count_file)){
 ################################################
 ################################################
 
-count.table             <- read.delim(file=opt$count_file,header=TRUE)
-rownames(count.table)   <- count.table[,opt$id_col]
-count.table             <- count.table[,opt$count_col:ncol(count.table),drop=FALSE]
-colnames(count.table)   <- gsub(opt$sample_suffix,"",colnames(count.table))
-colnames(count.table)   <- as.character(lapply(colnames(count.table), function (x) tail(strsplit(x,'.',fixed=TRUE)[[1]],1)))
+count.table           <- read.delim(file=opt$count_file,header=TRUE)
+rownames(count.table) <- count.table[,opt$id_col]
+count.table           <- count.table[,opt$count_col:ncol(count.table),drop=FALSE]
+colnames(count.table) <- gsub(opt$sample_suffix,"",colnames(count.table))
+colnames(count.table) <- gsub(pattern='\\.$', replacement='', colnames(count.table))
 
 ################################################
 ################################################

diff --git a/conf/modules.config b/conf/modules.config
@@ -146,7 +146,7 @@ params {
             publish_dir   = "${params.aligner}/bigwig"
         }
         'deseq2_qc' {
-            args          = "--id_col 1 --sample_suffix '' --outdir ./ --outprefix deseq2"
+            args          = "--id_col 1 --sample_suffix '' --outprefix deseq2"
             publish_files = ['RData':'', 'pca.vals.txt':'', 'plots.pdf':'', 'sample.dists.txt':'', 'size_factors':'', 'log':'']
             publish_dir   = "${params.aligner}/deseq2_qc"
         }

diff --git a/conf/test.config b/conf/test.config
@@ -8,7 +8,7 @@
  */
 
 params {
-  config_profile_name = 'Test profile'
+  config_profile_name        = 'Test profile'
   config_profile_description = 'Minimal test dataset to check pipeline function'
 
   // Limit resources so that this can run CI
@@ -34,4 +34,13 @@ params {
   // Other parameters
   pseudo_aligner      = 'salmon'
   umitools_bc_pattern = 'NNNN'
+
+  // When using RSEM, pass a smaller --genomeSAindexNbases to STAR to avoid its warning whilst building tiny indices
+  modules {
+    'rsem_preparereference' {
+      args2 = "--genomeSAindexNbases 7"
+    }
+  }
 }
+
+
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -8,14 +8,14 @@
  */
 
 params {
-  config_profile_name = 'Full test profile'
+  config_profile_name        = 'Full test profile'
   config_profile_description = 'Full test dataset to check pipeline function'
 
   // Input data for full size test
-  input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/samplesheet_full.csv'
+  input          = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/samplesheet_full.csv'
 
-  // Genome references
-  genome = 'GRCh37'
+  // Genome reference
+  genome         = 'GRCh37'
 
   // Other parameters
   pseudo_aligner = 'salmon'

diff --git a/conf/test_sra.config b/conf/test_sra.config
@@ -8,7 +8,7 @@
  */
 
 params {
-  config_profile_name = 'Public data download test profile'
+  config_profile_name        = 'Public data download test profile'
   config_profile_description = 'Minimal test dataset to check pipeline function when downloading data via the ENA'
 
   // Limit resources so that this can run CI

diff --git a/modules/nf-core/software/rsem/preparereference/main.nf b/modules/nf-core/software/rsem/preparereference/main.nf
@@ -24,15 +24,41 @@ process RSEM_PREPAREREFERENCE {
 
     script:
     def software = getSoftwareName(task.process)
-    """
-    mkdir rsem
-    rsem-prepare-reference \\
-        --gtf $gtf \\
-        --num-threads $task.cpus \\
-        $options.args \\
-        $fasta \\
-        rsem/genome
+    def args     = options.args.tokenize()
+    if (args.contains('--star')) {
+        args.removeIf { it.contains('--star') }
+        def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : ''
+        """
+        mkdir rsem
+        STAR \\
+            --runMode genomeGenerate \\
+            --genomeDir rsem/ \\
+            --genomeFastaFiles $fasta \\
+            --sjdbGTFfile $gtf \\
+            --runThreadN $task.cpus \\
+            $memory \\
+            $options.args2
+        
+        rsem-prepare-reference \\
+            --gtf $gtf \\
+            --num-threads $task.cpus \\
+            ${args.join(' ')} \\
+            $fasta \\
+            rsem/genome
 
-    rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt
-    """
+        rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt
+        """
+    } else {
+        """
+        mkdir rsem
+        rsem-prepare-reference \\
+            --gtf $gtf \\
+            --num-threads $task.cpus \\
+            $options.args \\
+            $fasta \\
+            rsem/genome
+
+        rsem-calculate-expression --version | sed -e "s/Current version: RSEM v//g" > ${software}.version.txt
+        """
+    }
 }
diff --git a/rnaseq.nf b/rnaseq.nf
@@ -299,9 +299,9 @@ workflow RNASEQ {
         ch_input
     )
     .map {
-        meta, bam ->
+        meta, fastq ->
             meta.id = meta.id.split('_')[0..-2].join('_')
-            [ meta, bam ] }
+            [ meta, fastq ] }
     .groupTuple(by: [0])
     .map { it ->  [ it[0], it[1].flatten() ] }
     .set { ch_cat_fastq }