From ea4f2a2bae51d8546fb2ed466ec469213839553c Mon Sep 17 00:00:00 2001 From: BIOPZ-Katsantoni Maria Date: Tue, 10 Jan 2023 11:16:22 +0100 Subject: [PATCH 1/4] feat: add single-strand preprocessing subworkflow --- workflow/rules/rcrunch_preprocessing_se.smk | 208 ++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 workflow/rules/rcrunch_preprocessing_se.smk diff --git a/workflow/rules/rcrunch_preprocessing_se.smk b/workflow/rules/rcrunch_preprocessing_se.smk new file mode 100644 index 0000000..01504fb --- /dev/null +++ b/workflow/rules/rcrunch_preprocessing_se.smk @@ -0,0 +1,208 @@ +# ________________________________________________________________________________ +# Preprocessing subworkflow for paired-end data. First step of analysis +# ________________________________________________________________________________ +import os +rule create_index: + """ + Create an index of the human genome + """ + input: + genome = config["genome"], + gtf = config["gtf"] + + output: + chromosome_info = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt"), + + chromosomes_names = os.path.join( + config["output_dir"], + "STAR_index", + "chrName.txt") + + params: + cluster_log_path = config["cluster_log"], + output_dir = os.path.join( + config["output_dir"], + "STAR_index"), + outFileNamePrefix = os.path.join( + config["output_dir"], + "STAR_index/STAR_"), + sjdbOverhang = config["sjdbOverhang"] + + singularity: + "docker://zavolab/star:2.6.0a" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "preprocessing", + "STAR_index.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "preprocessing", + "STAR_index.stderr.log" + ), + + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + STAR \ + --runMode genomeGenerate \ + --sjdbOverhang {params.sjdbOverhang} \ + --genomeDir {params.output_dir} \ + --genomeFastaFiles {input.genome} \ + --runThreadN {threads} \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --sjdbGTFfile {input.gtf}) 1> {log.stdout} 2> {log.stderr}" + + +rule create_csv_from_ensembl_gtf: + """ + Create a csv based on the gtf file of ensembl + """ + input: + gtf = config["gtf"] + + output: + output = os.path.join( + config["output_dir"], + "ensembl_csv", + "ensembl.csv"), + + flag = os.path.join( + config["output_dir"], + "ensembl_csv", + "done"), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_ensembl__gtf_to_csv.py" + ), + + singularity: + "docker://zavolab/rcrunch_python:1.0" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "preprocessing", + "ensembl_csv.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "preprocessing", + "ensembl_csv.stderr.log" + ), + + shell: + "(python {params.script} \ + --gtf_file {input.gtf} \ + --flag {output.flag} \ + --out_file {output.output}) 1> {log.stdout} 2> {log.stderr}" + +rule cutadapt: + """ + Trim 3' and 5' adapters + """ + input: + reads = lambda wildcards: + os.path.join( + config["input_dir"], + config[wildcards.sample]['mate1'] + '.fastq.gz'), + mate1_3p = lambda wildcards: config[wildcards.sample]['mate1_3p'], + mate1_5p = lambda wildcards: config[wildcards.sample]['mate1_5p'], + output: + reads = os.path.join( + config["output_dir"], + "cutadapt_replicates", + "{sample}.cutadapt_standard.fastq.gz") + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/cutadapt:1.16" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "preprocessing", + "cutadapt__{sample}.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "preprocessing", + "cutadapt__{sample}.stderr.log" + ) + + shell: + "cutadapt \ + -e 0.1 \ + -O 1 \ + -j {threads} \ + -m 2 \ + -n 1 \ + -a file:{input.mate1_3p} \ + -g file:{input.mate1_5p} \ + -o {output.reads} \ + {input.reads}) 1> {log.stdout} 2> {log.stderr}" + + +rule umi_tools_format: + """ + Create a UMI-tools barcode collapse format for any ENCODE samples. + """ + input: + reads = os.path.join( + config["output_dir"], + "cutadapt_replicates2", + "{sample}.cutadapt_standard.fastq.gz") + + output: + reads = os.path.join( + config["output_dir"], + "cutadapt_replicates2", + "{sample}.cutadapt_encode.fastq.gz") + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_convert_fastq_to_umi_format.py" + ), + read_format = lambda wildcards: + config[wildcards.sample]['format'] + + singularity: + "docker://zavolab/rcrunch_python:1.0" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "preprocessing", + "umi_format__{sample}.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "preprocessing", + "umi_format__{sample}.stderr.log" + ), + shell: + "(python {params.script} \ + --infile {input.reads} \ + --outfile {output.reads} \ + --read_format {params.read_format};) 1> {log.stdout} 2> {log.stderr}" From 93cfe62b1189d9fc2d7a86d4d216f533f5281716 Mon Sep 17 00:00:00 2001 From: BIOPZ-Katsantoni Maria Date: Fri, 13 Jan 2023 10:37:20 +0100 Subject: [PATCH 2/4] feat: single and paired-end implementation --- test/test_singularity_execution/config.yaml | 18 +- workflow/Snakefile | 26 +- workflow/rules/common.smk | 38 + workflow/rules/rcrunch_genomic.smk | 1457 +++++++++ workflow/rules/rcrunch_genomic_pe.smk | 1511 +-------- workflow/rules/rcrunch_genomic_se.smk | 151 + workflow/rules/rcrunch_preprocessing.smk | 111 + workflow/rules/rcrunch_preprocessing_pe.smk | 136 +- workflow/rules/rcrunch_preprocessing_se.smk | 135 +- workflow/rules/rcrunch_transcriptomic.smk | 2349 ++++++++++++++ workflow/rules/rcrunch_transcriptomic_pe.smk | 2888 ++---------------- workflow/rules/rcrunch_transcriptomic_se.smk | 508 +++ 12 files changed, 4966 insertions(+), 4362 deletions(-) create mode 100644 workflow/rules/common.smk create mode 100644 workflow/rules/rcrunch_genomic.smk create mode 100644 workflow/rules/rcrunch_genomic_se.smk create mode 100644 workflow/rules/rcrunch_preprocessing.smk create mode 100644 workflow/rules/rcrunch_transcriptomic.smk create mode 100644 workflow/rules/rcrunch_transcriptomic_se.smk diff --git a/test/test_singularity_execution/config.yaml b/test/test_singularity_execution/config.yaml index 0980356..648b1fa 100644 --- a/test/test_singularity_execution/config.yaml +++ b/test/test_singularity_execution/config.yaml @@ -29,7 +29,7 @@ # ------------------------------------------ # Experiment specific info - REQUIRED - EXPERIMENT_SET : ["PUM2_K562_ENCSR661ICQ_2"] + EXPERIMENT_SET : ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"] # Experiment specific info - REQUIRED # ***Note: an experiment refers to the foreground (replicates) and background (smis) samples @@ -38,7 +38,7 @@ # as one sample (This functionality is useful for treating technical replicates as one sample) # Make sure that the samples you merge have the same sample features (e.g paired, sense, dup_type) # Choose an informative experiment name - PUM2_K562_ENCSR661ICQ_2: { + PUM2_K562_ENCSR661ICQ_2_paired_end: { rbp: "PUM2", replicates: ["ENCFF041KJT_ENCFF462SCV"], smis: ["ENCFF616FCF_ENCFF495ZPY"], @@ -46,14 +46,23 @@ window_b: 300, step_size: 150, background_type: "standard"} + PUM2_K562_ENCSR661ICQ_2_single_end: { + rbp: "PUM2", + replicates: ["ENCFF462SCV"], + smis: ["ENCFF495ZPY"], + window_f: 300, + window_b: 300, + step_size: 150, + background_type: "standard"} # Sample specific info - REQUIRED + ENCFF462SCV: { mate1: "ENCFF462SCV.chr20", paired: 1, sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" } + ENCFF495ZPY: { mate1: "ENCFF495ZPY.chr20", paired: 1, sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" } ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" } ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" } - # ---------------------------------------------------------------------------------------------- # RCRUNCH specific options - REQUIRED - DEFAULT VALUES SUGGESTED - method_types : ["GN"] # options: GN, TR if you want BOTH methods, you can specify ['GN', 'TR'] + method_types : ["GN", "TR"] # options: GN, TR if you want BOTH methods, you can specify ['GN', 'TR'] # This will create both transcriptomic and genomic predictions multimappers: 1 # change if you want to accept reads that map to multiple regions # (the number indicates the number of regions a read can map to at @@ -65,7 +74,6 @@ ## RCRUNCH specific options - DEFAULTS - Use with care - seq_type: "pe" # paired end - currently only paired-end seq are allowed motif_lengths: [6, 8] # Motif sizes to be considered. peak_center : ["crosslink", "peak_center"] # whether to use 'crosslink' (position within where most reads start) # or 'peak_center' as the center of the peak diff --git a/workflow/Snakefile b/workflow/Snakefile index 7e42f1c..8e61a59 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -46,20 +46,16 @@ rule finish: # experiment=config[config["EXPERIMENT_SET"][0]]["replicates"]) - -if config['seq_type'] == 'pe': - if ('TR' in config['method_types']) & ('GN' in config['method_types']): - include: 'rules/rcrunch_preprocessing_pe.smk' - include: 'rules/rcrunch_genomic_pe.smk' - include: 'rules/rcrunch_transcriptomic_pe.smk' - include: 'rules/rcrunch_motif_analysis.smk' - elif 'TR' in config['method_types']: - include: 'rules/rcrunch_preprocessing_pe.smk' - include: 'rules/rcrunch_transcriptomic_pe.smk' - include: 'rules/rcrunch_motif_analysis.smk' - elif 'GN' in config['method_types']: - include: 'rules/rcrunch_preprocessing_pe.smk' - include: 'rules/rcrunch_genomic_pe.smk' - include: 'rules/rcrunch_motif_analysis.smk' +include: 'rules/rcrunch_preprocessing_pe.smk' +include: 'rules/rcrunch_preprocessing_se.smk' +include: 'rules/rcrunch_preprocessing.smk' +include: 'rules/rcrunch_genomic_pe.smk' +include: 'rules/rcrunch_genomic_se.smk' +include: 'rules/rcrunch_genomic.smk' +include: 'rules/rcrunch_transcriptomic.smk' +include: 'rules/rcrunch_transcriptomic_pe.smk' +include: 'rules/rcrunch_transcriptomic_se.smk' +include: 'rules/rcrunch_motif_analysis.smk' +include: 'rules/common.smk' diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk new file mode 100644 index 0000000..fb1b213 --- /dev/null +++ b/workflow/rules/common.smk @@ -0,0 +1,38 @@ +def get_mates(sample): + """Get number of mates""" + try: + if config[sample]['mate2']: + return "pe" + else: + return "se" + except KeyError: + return "se" + +def get_mates_number(sample): + """Get library type""" + try: + if config[sample]['mate2']: + return 2 + else: + return 1 + except KeyError: + return 2 + +def get_library_type(sample, sense): + try: + if config[sample]['mate2']: + if str(int(sense)) == '2': + return "ISR" + if str(int(sense)) == '1': + return "ISF" + else: + if str(int(sense)) == '2': + return "SR" + if str(int(sense)) == '1': + return "SF" + except KeyError: + if str(int(sense)) == '2': + return "SR" + if str(int(sense)) == '1': + return "SF" + \ No newline at end of file diff --git a/workflow/rules/rcrunch_genomic.smk b/workflow/rules/rcrunch_genomic.smk new file mode 100644 index 0000000..e117c45 --- /dev/null +++ b/workflow/rules/rcrunch_genomic.smk @@ -0,0 +1,1457 @@ +# ________________________________________________________________________________ +# Genomic rcrunch for paired end CLIP data +# ________________________________________________________________________________ +import os + + +rule GN_index_mappings: + """ + Sort and index alignments + according to coordinates + """ + input: + bam = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}", + "{sample}.{mates}.bam"), + sample=wildcards.sample, + mates=get_mates(wildcards.sample)), + + logfile = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}", + "{sample}.{mates}.Log.final.out"), + sample=wildcards.sample, + mates=get_mates(wildcards.sample)), + + output: + bam = temp(os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}_sorted.bam" + )), + + bai = temp(os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}_sorted.bam.bai" + )), + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}_temp" + ), + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "sort_mappings.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "sort_mappings.stderr.log" + ), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_flag_ncRNA_reads: + """ + Remove reads mapping to specific + nc categories specified in the config + - identification step + """ + input: + bam = os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}_sorted.bam" + ), + bai = os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}_sorted.bam.bai" + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_filter_ncRNAs.py" + ), + ncRNAs = config['ncRNAs'], + ncRNA_biotypes = expand(config['ncRNA_biotypes']), + paired = lambda wildcards: + config[wildcards.sample]['paired'], + sense = lambda wildcards: + config[wildcards.sample]['sense'], + + output: + outfile = temp(os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.ncrna.txt")), + flag = temp(os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}_done" + )), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "flag_ncRNA_reads.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "flag_ncRNA_reads.stderr.log" + ), + + shell: + "(python {params.script} \ + --bamfile {input.bam} \ + --RNA_central {params.ncRNAs} \ + --outfile {output.outfile} \ + --paired {params.paired} \ + --sense {params.sense} \ + --ncRNAs {params.ncRNA_biotypes} \ + --flag {output.flag} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_remove_ncRNA_reads: + """ + Removal of the ncreads + - removal step + """ + input: + bam = os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}_sorted.bam" + ), + read_names = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.ncrna.txt" + ), + + output: + bam = temp(os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.bam" + )), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/picard:2.18.9" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "remove_ncRNA_reads.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "remove_ncRNA_reads.stderr.log" + ), + + shell: + "(java -jar /usr/local/bin/picard.jar FilterSamReads \ + I={input.bam} \ + O={output.bam} \ + READ_LIST_FILE={input.read_names} \ + FILTER=excludeReadList \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_sort_ncRNA_rm: + """ + Sort and index alignments + according to coordinates + """ + input: + bam = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.bam" + ), + + output: + bam = temp(os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.sorted.bam" + )), + bai = temp(os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.sorted.bam.bai" + )), + + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}_temp" + ), + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "sort_ncrna_rm.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "sort_ncrna_rm.stderr.log" + ), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_flag_duplicates: + """ + Duplicate removal in the absence of UMIs + - detection step + """ + input: + bam = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.sorted.bam" + ), + bai = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.sorted.bam.bai" + ), + + output: + bam = temp(os.path.join( + config["output_dir"], + "GN", + "flag_duplicates", + "{sample}.duplicates.Processed.out.bam")) + + params: + cluster_log_path = config["cluster_log"], + output_dir = os.path.join( + config["output_dir"], + "GN", + "flag_duplicates" + ), + outFileNamePrefix = os.path.join( + config["output_dir"], + "GN", + "flag_duplicates", + "{sample}.duplicates." + ), + + singularity: + "docker://zavolab/star:2.6.0a" + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "flag_duplicates.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "flag_duplicates.stderr.log" + ), + + shell: + "(STAR \ + --inputBAMfile {input.bam} \ + --bamRemoveDuplicatesType UniqueIdenticalNotMulti \ + --runMode inputAlignmentsFromBAM \ + --outFileNamePrefix {params.outFileNamePrefix} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_remove_duplicates: + """ + Duplicate removal in the absence of UMIs + - removal step + """ + input: + bam = os.path.join( + config["output_dir"], + "GN", + "flag_duplicates", + "{sample}.duplicates.Processed.out.bam" + ), + + output: + bam = temp(os.path.join( + config["output_dir"], + "GN", + "remove_duplicates", + "{sample}.duplicates.bam" + )), + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_filter_duplicates.py" + ), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "remove_duplicates.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "remove_duplicates.stderr.log" + ), + + shell: + "(python {params.script} \ + --bamfile {input.bam} \ + --outfile {output.bam} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_no_duplicate_removal: + """ + No duplicate removal + """ + input: + bam = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.sorted.bam" + ), + bai = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.sorted.bam.bai" + ), + + output: + bam = temp(os.path.join( + config["output_dir"], + "GN", + "remove_duplicates", + "{sample}.with_duplicates.bam" + )), + bai = temp(os.path.join( + config["output_dir"], + "GN", + "remove_duplicates", + "{sample}.with_duplicates.bam.bai" + )), + + singularity: + "docker://bash:5.0.16" + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "no_duplicate_collapse.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "no_duplicate_collapse.stderr.log" + ), + + shell: + "(cp {input.bam} {output.bam}; \ + cp {input.bai} {output.bai}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_merge: + input: + bam = lambda wildcards: + expand( + os.path.join( + config["output_dir"], + "GN", + "remove_duplicates", + "{sample}.{dup_type}.{mates}.bam"), + sample=config[wildcards.experiment][wildcards.name], + dup_type=config[config[wildcards.experiment][wildcards.name][0]]['dup_type'], + mates=get_mates(config[wildcards.experiment][wildcards.name][0]), + ), + output: + bam = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}_merge_samples.bam" + )), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "merge_samples__{name}.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "merge_samples__{name}.stderr.log" + ), + + shell: + "(samtools merge \ + {output.bam} {input.bam}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_sort_merged: + input: + bam = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}_merge_samples.bam" + ), + + output: + bam = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam" + ), + bai = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam.bai" + ), + + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}_merge_samples_temp") + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "merge_samples__{name}.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "merge_samples__{name}.stderr.log" + ), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_read_frequencies: + input: + bam = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam" + ), + + output: + frequencies = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}.frequencies.csv" + )), + + params: + cluster_log_path = config["cluster_log"], + paired = lambda wildcards: + config[config[wildcards.experiment][wildcards.name][0]]['paired'], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_bam_get_read_frequencies.py" + ), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "read_frequencies__{name}.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "read_frequencies__{name}.stderr.log" + ), + + shell: + "(python {params.script} \ + --bamfile {input.bam} \ + --paired {params.paired} \ + --outfile {output.frequencies}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_bam_to_bed: + """ + Convert bamfile to bedfile using bedtools + """ + input: + bam = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam" + ), + + output: + bed = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_{name}_merge_samples.bed" + )), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/bedtools:2.27.0" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "bam_to_bed___{name}.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "bam_to_bed__{name}.stderr.log" + ), + + shell: + "(bedtools bamtobed \ + -i {input.bam} > {output.bed}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_genome_coverage: + """ + Obtain windows of read coverage genome + """ + input: + chromosome_info = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + fg_bed = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_replicates_merge_samples.bed" + ), + bg_bed = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_smis_merge_samples.bed" + ), + fg_frequencies = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_replicates.frequencies.csv" + ), + bg_frequencies = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_smis.frequencies.csv" + ), + + output: + coverage = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}.gn_coverage.bed" + )), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "sliding_windows.py" + ), + output_dir = os.path.join( + config["output_dir"], + "GN", + "{experiment}"), + window_f = lambda wildcards: + config[wildcards.experiment]["window_f"], + window_b = lambda wildcards: + config[wildcards.experiment]["window_b"], + step_size = lambda wildcards: + config[wildcards.experiment]["step_size"], + sense_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['sense'], + sense_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['sense'], + paired_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['paired'], + paired_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['paired'], + prefix = "{experiment}.gn_coverage", + background_type = lambda wildcards: + config[wildcards.experiment]["background_type"], + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "genome_coverage.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "genome_coverage.stderr.log" + ), + + shell: + "(python {params.script} \ + --bed_foreground {input.fg_bed} \ + --bed_background {input.bg_bed} \ + --foreground_frequencies {input.fg_frequencies} \ + --background_frequencies {input.bg_frequencies} \ + --out {params.output_dir} \ + --prefix {params.prefix} \ + --chromosomes {input.chromosome_info} \ + --sense_f {params.sense_f} \ + --sense_b {params.sense_b} \ + --paired_f {params.paired_f} \ + --paired_b {params.paired_b} \ + --window_f {params.window_f} \ + --window_b {params.window_b} \ + --step_size {params.step_size} \ + --data_type genome \ + --background {params.background_type} \ + --cutoff 1 \ + --threads {threads}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_enriched_regions: + """ + Obtain enriched regions for binding based on EM using + bg for noise estimation. Modified and adapted from: + Berger, Severin M., et al. "Crunch: Integrated processing and modeling of + ChIP-seq data in terms of regulatory motifs." + Genome research (2019): gr-239319 + Custom script. + """ + input: + reads = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}.gn_coverage.bed" + ), + + output: + reads = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_enriched_regions.csv" + ), + parameters = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_parameters.csv" + ), + all_regions = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_regions.csv" + ), + + params: + cluster_log_path = config["cluster_log"], + plot_dir = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "enriched_regions_plots"), + fdr_cutoff = config['FDR'], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_find_enriched_regions.py" + ), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "enriched_regions.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "enriched_regions.stderr.log" + ), + + shell: + "(mkdir -p {params.plot_dir}; \ + python {params.script} \ + --windows_table {input.reads} \ + --out {output.reads} \ + --out_parameters {output.parameters} \ + --all_regions {output.all_regions} \ + --out_folder {params.plot_dir} \ + --fdr_cutoff {params.fdr_cutoff} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_fit_peaks: + """ + Obtain peaks + """ + input: + fg_bam = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_replicates_merge_samples.sorted.bam" + ), + bg_bam = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_smis_merge_samples.sorted.bam" + ), + fg_frequencies = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_replicates.frequencies.csv" + ), + bg_frequencies = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_smis.frequencies.csv" + ), + reads = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_enriched_regions.csv" + ), + parameters = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_parameters.csv" + ), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + + output: + peaks = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_peaks.csv" + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_define_peaks.py" + ), + plot_dir = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "fit_peaks_plots"), + fragment_size = config["fragment_size"], + window_size = lambda wildcards: + config[wildcards.experiment]['window_f'], + paired_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['paired'], + paired_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['paired'], + sense_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['sense'], + sense_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['sense'], + background_type = lambda wildcards: + config[wildcards.experiment]["background_type"], + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "fit_peaks.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "fit_peaks.stderr.log" + ), + + shell: + "(mkdir -p {params.plot_dir}; \ + python {params.script} \ + --enriched_regions {input.reads} \ + --parameters {input.parameters} \ + --fragment_size {params.fragment_size} \ + --window_size {params.window_size} \ + --chromosomes_g {input.chromosomes_g} \ + --bam_foreground_g {input.fg_bam} \ + --bam_background_g {input.bg_bam} \ + --bam_fg_fq {input.fg_frequencies} \ + --bam_bg_fq {input.bg_frequencies} \ + --paired_f {params.paired_f} \ + --paired_b {params.paired_b} \ + --sense_f {params.sense_f} \ + --sense_b {params.sense_b} \ + --background {params.background_type} \ + --out {output.peaks} \ + --out_folder {params.plot_dir} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_peaks_split_sets_crosslink: + """ + Split peaks into training and test set + """ + input: + genome_fasta = config["genome"], + all_regions = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_regions.csv" + ), + reads = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_peaks.csv" + ), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + + output: + training = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "training.fasta" + )), + test = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "test.fasta" + )), + training_bg = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "training_bg.fasta" + )), + test_bg = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "test_bg.fasta" + )), + dirtemp = temp( + directory( + os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "tmpdir") + ) + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_peaks_split_sets.py" + ), + output_dir = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink" + ), + genome_tag = config["genome_tag"], + peak_size = 50, + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "peaks_split_sets__{run}_crosslink.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "peaks_split_sets__{run}_crosslink.stderr.log" + ), + + shell: + "(mkdir -p {params.output_dir}; \ + mkdir -p {output.dirtemp}; \ + python {params.script} \ + --peaks_file {input.reads} \ + --genome_fasta {input.genome_fasta} \ + --all_regions {input.all_regions} \ + --genome_tag {params.genome_tag} \ + --tempdir {output.dirtemp} \ + --chromosomes_g {input.chromosomes_g} \ + --peak_size {params.peak_size} \ + --out_folder {params.output_dir} \ + --crosslink_type 'crosslink' \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_peaks_split_sets: + """ + Split peaks into training and test set + """ + input: + genome_fasta = config["genome"], + all_regions = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_regions.csv" + ), + reads = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_peaks.csv" + ), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + + output: + training = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "training.fasta" + )), + test = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "test.fasta" + )), + training_bg = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "training_bg.fasta" + )), + test_bg = temp(os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "test_bg.fasta" + )), + dirtemp = temp( + directory( + os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "tmpdir") + ) + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_peaks_split_sets.py" + ), + output_dir = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center" + ), + genome_tag = config["genome_tag"], + peak_size = 50 + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "peaks_split_sets__{run}_peak_center.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "motif_analysis_crossvalidation", + "peaks_split_sets__{run}_peak_center.stderr.log" + ), + + shell: + "(mkdir -p {params.output_dir}; \ + mkdir -p {output.dirtemp}; \ + python {params.script} \ + --peaks_file {input.reads} \ + --genome_fasta {input.genome_fasta} \ + --all_regions {input.all_regions} \ + --genome_tag {params.genome_tag} \ + --tempdir {output.dirtemp} \ + --chromosomes_g {input.chromosomes_g} \ + --peak_size {params.peak_size} \ + --out_folder {params.output_dir} \ + --crosslink_type 'peak_center' \ + ) 1> {log.stdout} 2> {log.stderr}" + +rule GN_peaks_split_sets_crosslink_all_motifs: + """ + Get all peaks crosslink + """ + input: + genome_fasta = config["genome"], + all_regions = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_regions.csv" + ), + reads = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_peaks.csv" + ), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + + output: + test = temp( + os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_final", + "crosslink", + "test.fasta") + ), + test_bg = temp( + os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_final", + "crosslink", + "test_bg.fasta") + ), + dirtemp = temp( + directory( + os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_final", + "crosslink", + "tmpdir") + ) + ), + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "total_enrichment_peakstofasta.py" + ), + output_dir = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_final", + "crosslink", + ), + genome_tag = config["genome_tag"], + peak_size = 50, + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "motif_analysis_final", + "peaks_split_sets__all_motifs_crosslink.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "motif_analysis_final", + "peaks_split_sets__all_motifs_crosslink.stderr.log" + ), + + shell: + "(mkdir -p {params.output_dir}; \ + mkdir -p {output.dirtemp}; \ + python {params.script} \ + --peaks_file {input.reads} \ + --genome_fasta {input.genome_fasta} \ + --all_regions {input.all_regions} \ + --genome_tag {params.genome_tag} \ + --tempdir {output.dirtemp} \ + --chromosomes_g {input.chromosomes_g} \ + --peak_size {params.peak_size} \ + --out_folder {params.output_dir} \ + --crosslink_type 'crosslink' \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_peaks_split_sets_all_motifs: + """ + get all peaks crosslink + """ + input: + genome_fasta = config["genome"], + all_regions = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_regions.csv" + ), + reads = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "{experiment}_total_peaks.csv" + ), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + + output: + test = temp( + os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_final", + "peak_center", + "test.fasta") + ), + test_bg = temp( + os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_final", + "peak_center", + "test_bg.fasta") + ), + dirtemp = temp( + directory( + os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_final", + "peak_center", + "tmpdir") + ) + ), + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "total_enrichment_peakstofasta.py" + ), + output_dir = os.path.join( + config["output_dir"], + "GN", + "{experiment}", + "motif_analysis_final", + "peak_center", + ), + genome_tag = config["genome_tag"], + peak_size = 50 + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "motif_analysis_final", + "peaks_split_sets__all_motifs_standard.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{experiment}", + "motif_analysis_final", + "peaks_split_sets__all_motifs_standard.stderr.log" + ), + + shell: + "(mkdir -p {params.output_dir}; \ + mkdir -p {output.dirtemp}; \ + python {params.script} \ + --peaks_file {input.reads} \ + --genome_fasta {input.genome_fasta} \ + --all_regions {input.all_regions} \ + --genome_tag {params.genome_tag} \ + --tempdir {output.dirtemp} \ + --chromosomes_g {input.chromosomes_g} \ + --peak_size {params.peak_size} \ + --out_folder {params.output_dir} \ + --crosslink_type 'peak_center' \ + ) 1> {log.stdout} 2> {log.stderr}" + diff --git a/workflow/rules/rcrunch_genomic_pe.smk b/workflow/rules/rcrunch_genomic_pe.smk index 97d5040..c9e9985 100644 --- a/workflow/rules/rcrunch_genomic_pe.smk +++ b/workflow/rules/rcrunch_genomic_pe.smk @@ -3,7 +3,7 @@ # ________________________________________________________________________________ import os -rule GN_map_STAR: +rule GN_map_STAR_pe: """ map reads to genome""" input: genome = os.path.join( @@ -16,7 +16,7 @@ rule GN_map_STAR: os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate1.cutadapt_{format}.fastq.gz"), + "{sample}_mate1.pe.cutadapt_{format}.fastq.gz"), sample=wildcards.sample, format=config[wildcards.sample]['format'] ), @@ -25,36 +25,25 @@ rule GN_map_STAR: os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate2.cutadapt_{format}.fastq.gz"), + "{sample}_mate2.pe.cutadapt_{format}.fastq.gz"), sample=wildcards.sample, format=config[wildcards.sample]['format'] ), output: - bamfile = temp(os.path.join( + bamfile = os.path.join( config["output_dir"], "GN", "alignment", "{sample}", - "{sample}.bam" - )), - logfile = temp(os.path.join( + "{sample}.pe.bam" + ), + logfile = os.path.join( config["output_dir"], "GN", "alignment", "{sample}", - "{sample}Log.final.out" - )), - outdir = temp( - directory( - os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}") - ) - ), - + "{sample}.pe.Log.final.out"), params: cluster_log_path = config["cluster_log"], sample_id = "{sample}", @@ -67,13 +56,15 @@ rule GN_map_STAR: "GN", "alignment", "{sample}", - "{sample}", + "{sample}.pe.", ), multimappers = config['multimappers'] singularity: "docker://zavolab/star:2.6.0a" + shadow: "full" + threads: 12 log: @@ -81,13 +72,13 @@ rule GN_map_STAR: config["local_log"], "GN", "{sample}", - "map_STAR.stdout.log" + "map_STAR.pe.stdout.log" ), stderr = os.path.join( config["local_log"], "GN", "{sample}", - "map_STAR.stderr.log" + "map_STAR.pe.stderr.log" ), shell: @@ -114,281 +105,9 @@ rule GN_map_STAR: ) 1> {log.stdout} 2> {log.stderr}" -rule GN_index_mappings: - """ - Sort and index alignments - according to coordinates - """ - input: - bam = os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}", - "{sample}.bam" - ), - indir = os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}" - ), - - output: - bam = temp(os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}_sorted.bam" - )), - - bai = temp(os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}_sorted.bam.bai" - )), - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}_temp" - ), - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{sample}", - "sort_mappings.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{sample}", - "sort_mappings.stderr.log" - ), - - shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_flag_ncRNA_reads: - """ - Remove reads mapping to specific - nc categories specified in the config - - identification step - """ - input: - bam = os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}_sorted.bam" - ), - bai = os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}_sorted.bam.bai" - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_filter_ncRNAs.py" - ), - ncRNAs = config['ncRNAs'], - ncRNA_biotypes = expand(config['ncRNA_biotypes']), - paired = lambda wildcards: - config[wildcards.sample]['paired'], - sense = lambda wildcards: - config[wildcards.sample]['sense'], - - output: - outfile = temp(os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.ncrna.txt")), - flag = temp(os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}_done" - )), - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{sample}", - "flag_ncRNA_reads.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{sample}", - "flag_ncRNA_reads.stderr.log" - ), - - shell: - "(python {params.script} \ - --bamfile {input.bam} \ - --RNA_central {params.ncRNAs} \ - --outfile {output.outfile} \ - --paired {params.paired} \ - --sense {params.sense} \ - --ncRNAs {params.ncRNA_biotypes} \ - --flag {output.flag} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_remove_ncRNA_reads: - """ - Removal of the ncreads - - removal step - """ - input: - bam = os.path.join( - config["output_dir"], - "GN", - "alignment", - "{sample}_sorted.bam" - ), - read_names = os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.ncrna.txt" - ), - - output: - bam = temp(os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.filtered.bam" - )), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/picard:2.18.9" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{sample}", - "remove_ncRNA_reads.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{sample}", - "remove_ncRNA_reads.stderr.log" - ), - - shell: - "(java -jar /usr/local/bin/picard.jar FilterSamReads \ - I={input.bam} \ - O={output.bam} \ - READ_LIST_FILE={input.read_names} \ - FILTER=excludeReadList \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_sort_ncRNA_rm: - """ - Sort and index alignments - according to coordinates - """ - input: - bam = os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.filtered.bam" - ), - - output: - bam = temp(os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.filtered.sorted.bam" - )), - bai = temp(os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.filtered.sorted.bam.bai" - )), - - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}_temp" - ), - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{sample}", - "sort_ncrna_rm.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{sample}", - "sort_ncrna_rm.stderr.log" - ), - - shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_flag_duplicates: +rule GN_umi_collapse_pe: """ - Duplicate removal in the absence of UMIs - - detection step + Duplicate removal when there are UMIs """ input: bam = os.path.join( @@ -408,1214 +127,40 @@ rule GN_flag_duplicates: bam = temp(os.path.join( config["output_dir"], "GN", - "flag_duplicates", - "{sample}.duplicates.Processed.out.bam")) + "remove_duplicates", + "{sample}.umis.pe.bam" + )), params: cluster_log_path = config["cluster_log"], - output_dir = os.path.join( - config["output_dir"], - "GN", - "flag_duplicates" - ), - outFileNamePrefix = os.path.join( - config["output_dir"], - "GN", - "flag_duplicates", - "{sample}.duplicates." - ), - - singularity: - "docker://zavolab/star:2.6.0a" - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{sample}", - "flag_duplicates.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{sample}", - "flag_duplicates.stderr.log" - ), - - shell: - "(STAR \ - --inputBAMfile {input.bam} \ - --bamRemoveDuplicatesType UniqueIdenticalNotMulti \ - --runMode inputAlignmentsFromBAM \ - --outFileNamePrefix {params.outFileNamePrefix} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_remove_duplicates: - """ - Duplicate removal in the absence of UMIs - - removal step - """ - input: - bam = os.path.join( - config["output_dir"], - "GN", - "flag_duplicates", - "{sample}.duplicates.Processed.out.bam" - ), - - output: - bam = temp(os.path.join( + metrics_file = os.path.join( config["output_dir"], "GN", "remove_duplicates", - "{sample}.duplicates.bam" - )), - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_filter_duplicates.py" + "{sample}.umis.pe.metrics" ), singularity: - "docker://zavolab/rcrunch_python:1.0.5" + "docker://zavolab/umi-tools:0.5.4" log: stdout = os.path.join( config["local_log"], "GN", "{sample}", - "remove_duplicates.stdout.log" + "umi_collapse.pe.stdout.log" ), stderr = os.path.join( config["local_log"], "GN", "{sample}", - "remove_duplicates.stderr.log" + "umi_collapse.pe.stderr.log" ), shell: - "(python {params.script} \ - --bamfile {input.bam} \ - --outfile {output.bam} \ + "(umi_tools \ + dedup \ + -I {input.bam} \ + --paired \ + -S {output.bam} \ ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_no_duplicate_removal: - """ - No duplicate removal - """ - input: - bam = os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.filtered.sorted.bam" - ), - bai = os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.filtered.sorted.bam.bai" - ), - - output: - bam = temp(os.path.join( - config["output_dir"], - "GN", - "remove_duplicates", - "{sample}.with_duplicates.bam" - )), - bai = temp(os.path.join( - config["output_dir"], - "GN", - "remove_duplicates", - "{sample}.with_duplicates.bam.bai" - )), - - singularity: - "docker://bash:5.0.16" - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{sample}", - "no_duplicate_collapse.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{sample}", - "no_duplicate_collapse.stderr.log" - ), - - shell: - "(cp {input.bam} {output.bam}; \ - cp {input.bai} {output.bai}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_umi_collapse: - """ - Duplicate removal when there are UMIs - """ - input: - bam = os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.filtered.sorted.bam" - ), - bai = os.path.join( - config["output_dir"], - "GN", - "remove_ncRNAs", - "{sample}.filtered.sorted.bam.bai" - ), - - output: - bam = temp(os.path.join( - config["output_dir"], - "GN", - "remove_duplicates", - "{sample}.umis.bam" - )), - - params: - cluster_log_path = config["cluster_log"], - metrics_file = os.path.join( - config["output_dir"], - "GN", - "remove_duplicates", - "{sample}.umis.metrics" - ), - - singularity: - "docker://zavolab/umi-tools:0.5.4" - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{sample}", - "umi_collapse.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{sample}", - "umi_collapse.stderr.log" - ), - - shell: - "(umi_tools \ - dedup \ - -I {input.bam} \ - --paired \ - -S {output.bam} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_merge: - input: - bam = lambda wildcards: - expand( - os.path.join( - config["output_dir"], - "GN", - "remove_duplicates", - "{sample}.{dup_type}.bam"), - sample=config[wildcards.experiment][wildcards.name], - dup_type=config[config[wildcards.experiment][wildcards.name][0]]['dup_type'], - ), - output: - bam = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}_merge_samples.bam" - )), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "merge_samples__{name}.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "merge_samples__{name}.stderr.log" - ), - - shell: - "(samtools merge \ - {output.bam} {input.bam}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_sort_merged: - input: - bam = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}_merge_samples.bam" - ), - - output: - bam = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam" - ), - bai = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam.bai" - ), - - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}_merge_samples_temp") - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "merge_samples__{name}.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "merge_samples__{name}.stderr.log" - ), - - shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_read_frequencies: - input: - bam = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam" - ), - - output: - frequencies = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}.frequencies.csv" - )), - - params: - cluster_log_path = config["cluster_log"], - paired = lambda wildcards: - config[config[wildcards.experiment][wildcards.name][0]]['paired'], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_bam_get_read_frequencies.py" - ), - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "read_frequencies__{name}.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "read_frequencies__{name}.stderr.log" - ), - - shell: - "(python {params.script} \ - --bamfile {input.bam} \ - --paired {params.paired} \ - --outfile {output.frequencies}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_bam_to_bed: - """ - Convert bamfile to bedfile using bedtools - """ - input: - bam = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam" - ), - - output: - bed = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_{name}_merge_samples.bed" - )), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/bedtools:2.27.0" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "bam_to_bed___{name}.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "bam_to_bed__{name}.stderr.log" - ), - - shell: - "(bedtools bamtobed \ - -i {input.bam} > {output.bed}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_genome_coverage: - """ - Obtain windows of read coverage genome - """ - input: - chromosome_info = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - fg_bed = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_replicates_merge_samples.bed" - ), - bg_bed = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_smis_merge_samples.bed" - ), - fg_frequencies = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_replicates.frequencies.csv" - ), - bg_frequencies = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_smis.frequencies.csv" - ), - - output: - coverage = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}.gn_coverage.bed" - )), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "sliding_windows.py" - ), - output_dir = os.path.join( - config["output_dir"], - "GN", - "{experiment}"), - window_f = lambda wildcards: - config[wildcards.experiment]["window_f"], - window_b = lambda wildcards: - config[wildcards.experiment]["window_b"], - step_size = lambda wildcards: - config[wildcards.experiment]["step_size"], - sense_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['sense'], - sense_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['sense'], - paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], - paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], - prefix = "{experiment}.gn_coverage", - background_type = lambda wildcards: - config[wildcards.experiment]["background_type"], - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "genome_coverage.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "genome_coverage.stderr.log" - ), - - shell: - "(python {params.script} \ - --bed_foreground {input.fg_bed} \ - --bed_background {input.bg_bed} \ - --foreground_frequencies {input.fg_frequencies} \ - --background_frequencies {input.bg_frequencies} \ - --out {params.output_dir} \ - --prefix {params.prefix} \ - --chromosomes {input.chromosome_info} \ - --sense_f {params.sense_f} \ - --sense_b {params.sense_b} \ - --paired_f {params.paired_f} \ - --paired_b {params.paired_b} \ - --window_f {params.window_f} \ - --window_b {params.window_b} \ - --step_size {params.step_size} \ - --data_type genome \ - --background {params.background_type} \ - --cutoff 1 \ - --threads {threads}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_enriched_regions: - """ - Obtain enriched regions for binding based on EM using - bg for noise estimation. Modified and adapted from: - Berger, Severin M., et al. "Crunch: Integrated processing and modeling of - ChIP-seq data in terms of regulatory motifs." - Genome research (2019): gr-239319 - Custom script. - """ - input: - reads = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}.gn_coverage.bed" - ), - - output: - reads = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_enriched_regions.csv" - ), - parameters = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_parameters.csv" - ), - all_regions = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_regions.csv" - ), - - params: - cluster_log_path = config["cluster_log"], - plot_dir = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "enriched_regions_plots"), - fdr_cutoff = config['FDR'], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_find_enriched_regions.py" - ), - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "enriched_regions.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "enriched_regions.stderr.log" - ), - - shell: - "(mkdir -p {params.plot_dir}; \ - python {params.script} \ - --windows_table {input.reads} \ - --out {output.reads} \ - --out_parameters {output.parameters} \ - --all_regions {output.all_regions} \ - --out_folder {params.plot_dir} \ - --fdr_cutoff {params.fdr_cutoff} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_fit_peaks: - """ - Obtain peaks - """ - input: - fg_bam = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_replicates_merge_samples.sorted.bam" - ), - bg_bam = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_smis_merge_samples.sorted.bam" - ), - fg_frequencies = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_replicates.frequencies.csv" - ), - bg_frequencies = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_smis.frequencies.csv" - ), - reads = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_enriched_regions.csv" - ), - parameters = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_parameters.csv" - ), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - - output: - peaks = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_peaks.csv" - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_define_peaks.py" - ), - plot_dir = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "fit_peaks_plots"), - fragment_size = config["fragment_size"], - window_size = lambda wildcards: - config[wildcards.experiment]['window_f'], - paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], - paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], - sense_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['sense'], - sense_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['sense'], - background_type = lambda wildcards: - config[wildcards.experiment]["background_type"], - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "fit_peaks.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "fit_peaks.stderr.log" - ), - - shell: - "(mkdir -p {params.plot_dir}; \ - python {params.script} \ - --enriched_regions {input.reads} \ - --parameters {input.parameters} \ - --fragment_size {params.fragment_size} \ - --window_size {params.window_size} \ - --chromosomes_g {input.chromosomes_g} \ - --bam_foreground_g {input.fg_bam} \ - --bam_background_g {input.bg_bam} \ - --bam_fg_fq {input.fg_frequencies} \ - --bam_bg_fq {input.bg_frequencies} \ - --paired_f {params.paired_f} \ - --paired_b {params.paired_b} \ - --sense_f {params.sense_f} \ - --sense_b {params.sense_b} \ - --background {params.background_type} \ - --out {output.peaks} \ - --out_folder {params.plot_dir} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_peaks_split_sets_crosslink: - """ - Split peaks into training and test set - """ - input: - genome_fasta = config["genome"], - all_regions = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_regions.csv" - ), - reads = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_peaks.csv" - ), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - - output: - training = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "training.fasta" - )), - test = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "test.fasta" - )), - training_bg = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "training_bg.fasta" - )), - test_bg = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "test_bg.fasta" - )), - dirtemp = temp( - directory( - os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "tmpdir") - ) - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_peaks_split_sets.py" - ), - output_dir = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink" - ), - genome_tag = config["genome_tag"], - peak_size = 50, - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "peaks_split_sets__{run}_crosslink.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "peaks_split_sets__{run}_crosslink.stderr.log" - ), - - shell: - "(mkdir -p {params.output_dir}; \ - mkdir -p {output.dirtemp}; \ - python {params.script} \ - --peaks_file {input.reads} \ - --genome_fasta {input.genome_fasta} \ - --all_regions {input.all_regions} \ - --genome_tag {params.genome_tag} \ - --tempdir {output.dirtemp} \ - --chromosomes_g {input.chromosomes_g} \ - --peak_size {params.peak_size} \ - --out_folder {params.output_dir} \ - --crosslink_type 'crosslink' \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_peaks_split_sets: - """ - Split peaks into training and test set - """ - input: - genome_fasta = config["genome"], - all_regions = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_regions.csv" - ), - reads = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_peaks.csv" - ), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - - output: - training = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "training.fasta" - )), - test = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "test.fasta" - )), - training_bg = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "training_bg.fasta" - )), - test_bg = temp(os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "test_bg.fasta" - )), - dirtemp = temp( - directory( - os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "tmpdir") - ) - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_peaks_split_sets.py" - ), - output_dir = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center" - ), - genome_tag = config["genome_tag"], - peak_size = 50 - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "peaks_split_sets__{run}_peak_center.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "motif_analysis_crossvalidation", - "peaks_split_sets__{run}_peak_center.stderr.log" - ), - - shell: - "(mkdir -p {params.output_dir}; \ - mkdir -p {output.dirtemp}; \ - python {params.script} \ - --peaks_file {input.reads} \ - --genome_fasta {input.genome_fasta} \ - --all_regions {input.all_regions} \ - --genome_tag {params.genome_tag} \ - --tempdir {output.dirtemp} \ - --chromosomes_g {input.chromosomes_g} \ - --peak_size {params.peak_size} \ - --out_folder {params.output_dir} \ - --crosslink_type 'peak_center' \ - ) 1> {log.stdout} 2> {log.stderr}" - -rule GN_peaks_split_sets_crosslink_all_motifs: - """ - Get all peaks crosslink - """ - input: - genome_fasta = config["genome"], - all_regions = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_regions.csv" - ), - reads = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_peaks.csv" - ), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - - output: - test = temp( - os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_final", - "crosslink", - "test.fasta") - ), - test_bg = temp( - os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_final", - "crosslink", - "test_bg.fasta") - ), - dirtemp = temp( - directory( - os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_final", - "crosslink", - "tmpdir") - ) - ), - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "total_enrichment_peakstofasta.py" - ), - output_dir = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_final", - "crosslink", - ), - genome_tag = config["genome_tag"], - peak_size = 50, - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "motif_analysis_final", - "peaks_split_sets__all_motifs_crosslink.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "motif_analysis_final", - "peaks_split_sets__all_motifs_crosslink.stderr.log" - ), - - shell: - "(mkdir -p {params.output_dir}; \ - mkdir -p {output.dirtemp}; \ - python {params.script} \ - --peaks_file {input.reads} \ - --genome_fasta {input.genome_fasta} \ - --all_regions {input.all_regions} \ - --genome_tag {params.genome_tag} \ - --tempdir {output.dirtemp} \ - --chromosomes_g {input.chromosomes_g} \ - --peak_size {params.peak_size} \ - --out_folder {params.output_dir} \ - --crosslink_type 'crosslink' \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule GN_peaks_split_sets_all_motifs: - """ - get all peaks crosslink - """ - input: - genome_fasta = config["genome"], - all_regions = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_regions.csv" - ), - reads = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "{experiment}_total_peaks.csv" - ), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - - output: - test = temp( - os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_final", - "peak_center", - "test.fasta") - ), - test_bg = temp( - os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_final", - "peak_center", - "test_bg.fasta") - ), - dirtemp = temp( - directory( - os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_final", - "peak_center", - "tmpdir") - ) - ), - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "total_enrichment_peakstofasta.py" - ), - output_dir = os.path.join( - config["output_dir"], - "GN", - "{experiment}", - "motif_analysis_final", - "peak_center", - ), - genome_tag = config["genome_tag"], - peak_size = 50 - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "motif_analysis_final", - "peaks_split_sets__all_motifs_standard.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "GN", - "{experiment}", - "motif_analysis_final", - "peaks_split_sets__all_motifs_standard.stderr.log" - ), - - shell: - "(mkdir -p {params.output_dir}; \ - mkdir -p {output.dirtemp}; \ - python {params.script} \ - --peaks_file {input.reads} \ - --genome_fasta {input.genome_fasta} \ - --all_regions {input.all_regions} \ - --genome_tag {params.genome_tag} \ - --tempdir {output.dirtemp} \ - --chromosomes_g {input.chromosomes_g} \ - --peak_size {params.peak_size} \ - --out_folder {params.output_dir} \ - --crosslink_type 'peak_center' \ - ) 1> {log.stdout} 2> {log.stderr}" - diff --git a/workflow/rules/rcrunch_genomic_se.smk b/workflow/rules/rcrunch_genomic_se.smk new file mode 100644 index 0000000..4670e88 --- /dev/null +++ b/workflow/rules/rcrunch_genomic_se.smk @@ -0,0 +1,151 @@ +import os + +rule GN_map_STAR_se: + """ map reads to genome""" + input: + genome = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + reads = lambda wildcards: + expand( + os.path.join( + config["output_dir"], + "cutadapt", + "{sample}_mate1.se.cutadapt_{format}.fastq.gz"), + sample=wildcards.sample, + format=config[wildcards.sample]['format'] + ), + output: + bamfile = os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}", + "{sample}.se.bam" + ), + logfile = os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}", + "{sample}.se.Log.final.out"), + params: + cluster_log_path = config["cluster_log"], + sample_id = "{sample}", + genome = os.path.join( + config["output_dir"], + "STAR_index" + ), + outFileNamePrefix = os.path.join( + config["output_dir"], + "GN", + "alignment", + "{sample}", + "{sample}.se.", + ), + multimappers = config['multimappers'] + shadow: "full" + + singularity: + "docker://zavolab/star:2.6.0a" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "map_STAR.se.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "map_STAR.se.stderr.log" + ), + + shell: + "(STAR \ + --runMode alignReads \ + --runThreadN {threads} \ + --genomeDir {params.genome} \ + --readFilesIn {input.reads} \ + --readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outSAMattributes All \ + --outStd BAM_Unsorted \ + --outSAMtype BAM Unsorted \ + --outFilterScoreMinOverLread 0.2 \ + --outFilterMatchNminOverLread 0.2 \ + --outFilterMismatchNoverLmax 0.1 \ + --outFilterType BySJout \ + --outReadsUnmapped Fastx \ + --outSAMattrRGline ID:rcrunch SM:{params.sample_id} \ + --alignEndsType EndToEnd > {output.bamfile} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule GN_umi_collapse_se: + """ + Duplicate removal when there are UMIs + """ + input: + bam = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.sorted.bam" + ), + bai = os.path.join( + config["output_dir"], + "GN", + "remove_ncRNAs", + "{sample}.filtered.sorted.bam.bai" + ), + + output: + bam = temp(os.path.join( + config["output_dir"], + "GN", + "remove_duplicates", + "{sample}.umis.se.bam" + )), + + params: + cluster_log_path = config["cluster_log"], + metrics_file = os.path.join( + config["output_dir"], + "GN", + "remove_duplicates", + "{sample}.umis.se.metrics" + ), + + singularity: + "docker://zavolab/umi-tools:0.5.4" + + log: + stdout = os.path.join( + config["local_log"], + "GN", + "{sample}", + "umi_collapse.se.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "GN", + "{sample}", + "umi_collapse.se.stderr.log" + ), + + shell: + "(umi_tools \ + dedup \ + -I {input.bam} \ + -S {output.bam} \ + ) 1> {log.stdout} 2> {log.stderr}" diff --git a/workflow/rules/rcrunch_preprocessing.smk b/workflow/rules/rcrunch_preprocessing.smk new file mode 100644 index 0000000..7a1fd3b --- /dev/null +++ b/workflow/rules/rcrunch_preprocessing.smk @@ -0,0 +1,111 @@ +import os +rule create_index: + """ + Create an index of the human genome + """ + input: + genome = config["genome"], + gtf = config["gtf"] + + output: + chromosome_info = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + chromosomes_names = os.path.join( + config["output_dir"], + "STAR_index", + "chrName.txt" + ), + + params: + cluster_log_path = config["cluster_log"], + output_dir = os.path.join( + config["output_dir"], + "STAR_index" + ), + outFileNamePrefix = os.path.join( + config["output_dir"], + "STAR_index/STAR_" + ), + sjdbOverhang = config["sjdbOverhang"], + + singularity: + "docker://zavolab/star:2.6.0a" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "preprocessing", + "STAR_index.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "preprocessing", + "STAR_index.stderr.log" + ), + + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + STAR \ + --runMode genomeGenerate \ + --sjdbOverhang {params.sjdbOverhang} \ + --genomeDir {params.output_dir} \ + --genomeFastaFiles {input.genome} \ + --runThreadN {threads} \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --sjdbGTFfile {input.gtf}) 1> {log.stdout} 2> {log.stderr}" + +rule create_csv_from_ensembl_gtf: + """ + Create a csv based on the gtf file of ensembl + """ + input: + gtf = config["gtf"] + + output: + output = os.path.join( + config["output_dir"], + "ensembl_csv", + "ensembl.csv"), + + flag = os.path.join( + config["output_dir"], + "ensembl_csv", + "done"), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_ensembl__gtf_to_csv.py" + ), + + singularity: + "docker://zavolab/rcrunch_python:1.0" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "preprocessing", + "ensembl_csv.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "preprocessing", + "ensembl_csv.stderr.log" + ), + + shell: + "(python {params.script} \ + --gtf_file {input.gtf} \ + --flag {output.flag} \ + --out_file {output.output}) 1> {log.stdout} 2> {log.stderr}" + diff --git a/workflow/rules/rcrunch_preprocessing_pe.smk b/workflow/rules/rcrunch_preprocessing_pe.smk index 054bdc6..bdbaac2 100644 --- a/workflow/rules/rcrunch_preprocessing_pe.smk +++ b/workflow/rules/rcrunch_preprocessing_pe.smk @@ -2,120 +2,8 @@ # Preprocessing subworkflow for paired-end data. First step of analysis # ________________________________________________________________________________ import os -rule create_index: - """ - Create an index of the human genome - """ - input: - genome = config["genome"], - gtf = config["gtf"] - - output: - chromosome_info = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - chromosomes_names = os.path.join( - config["output_dir"], - "STAR_index", - "chrName.txt" - ), - - params: - cluster_log_path = config["cluster_log"], - output_dir = os.path.join( - config["output_dir"], - "STAR_index" - ), - outFileNamePrefix = os.path.join( - config["output_dir"], - "STAR_index/STAR_" - ), - sjdbOverhang = config["sjdbOverhang"], - - singularity: - "docker://zavolab/star:2.6.0a" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "preprocessing", - "STAR_index.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "preprocessing", - "STAR_index.stderr.log" - ), - - shell: - "(mkdir -p {params.output_dir}; \ - chmod -R 777 {params.output_dir}; \ - STAR \ - --runMode genomeGenerate \ - --sjdbOverhang {params.sjdbOverhang} \ - --genomeDir {params.output_dir} \ - --genomeFastaFiles {input.genome} \ - --runThreadN {threads} \ - --outFileNamePrefix {params.outFileNamePrefix} \ - --sjdbGTFfile {input.gtf}) 1> {log.stdout} 2> {log.stderr}" - - -rule create_csv_from_ensembl_gtf: - """ - Create a csv based on the gtf file of ensembl - """ - input: - gtf = config["gtf"] - - output: - output = os.path.join( - config["output_dir"], - "ensembl_csv", - "ensembl.csv" - ), - flag = os.path.join( - config["output_dir"], - "ensembl_csv", - "done" - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_ensembl__gtf_to_csv.py" - ), - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "preprocessing", - "ensembl_csv.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "preprocessing", - "ensembl_csv.stderr.log" - ), - - shell: - "(python {params.script} \ - --gtf_file {input.gtf} \ - --flag {output.flag} \ - --out_file {output.output}) 1> {log.stdout} 2> {log.stderr}" - -rule cutadapt: +rule cutadapt_pe: """ Trim 3' and 5' adapters """ @@ -140,12 +28,12 @@ rule cutadapt: reads1 = os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate1.cutadapt_standard.fastq.gz"), + "{sample}_mate1.pe.cutadapt_standard.fastq.gz"), reads2 = os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate2.cutadapt_standard.fastq.gz") + "{sample}_mate2.pe.cutadapt_standard.fastq.gz") params: cluster_log_path = config["cluster_log"], @@ -159,12 +47,12 @@ rule cutadapt: stdout = os.path.join( config["local_log"], "preprocessing", - "cutadapt__{sample}.stdout.log" + "cutadapt__{sample}.pe.stdout.log" ), stderr = os.path.join( config["local_log"], "preprocessing", - "cutadapt__{sample}.stderr.log" + "cutadapt__{sample}.pe.stderr.log" ) @@ -186,7 +74,7 @@ rule cutadapt: {input.reads2}) 1> {log.stdout} 2> {log.stderr}" -rule umi_tools_format: +rule umi_tools_format_pe: """ Create a UMI-tools barcode collapse format for any ENCODE samples. """ @@ -194,24 +82,24 @@ rule umi_tools_format: reads1 = os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate1.cutadapt_standard.fastq.gz" + "{sample}_mate1.pe.cutadapt_standard.fastq.gz" ), reads2 = os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate2.cutadapt_standard.fastq.gz" + "{sample}_mate2.pe.cutadapt_standard.fastq.gz" ), output: reads1 = os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate1.cutadapt_encode.fastq.gz" + "{sample}_mate1.pe.cutadapt_encode.fastq.gz" ), reads2 = os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate2.cutadapt_encode.fastq.gz" + "{sample}_mate2.pe.cutadapt_encode.fastq.gz" ), params: @@ -233,12 +121,12 @@ rule umi_tools_format: stdout = os.path.join( config["local_log"], "preprocessing", - "umi_format__{sample}.stdout.log" + "umi_format__{sample}.pe.stdout.log" ), stderr = os.path.join( config["local_log"], "preprocessing", - "umi_format__{sample}.stderr.log" + "umi_format__{sample}.pe.stderr.log" ), shell: diff --git a/workflow/rules/rcrunch_preprocessing_se.smk b/workflow/rules/rcrunch_preprocessing_se.smk index 01504fb..2346aa4 100644 --- a/workflow/rules/rcrunch_preprocessing_se.smk +++ b/workflow/rules/rcrunch_preprocessing_se.smk @@ -2,115 +2,8 @@ # Preprocessing subworkflow for paired-end data. First step of analysis # ________________________________________________________________________________ import os -rule create_index: - """ - Create an index of the human genome - """ - input: - genome = config["genome"], - gtf = config["gtf"] - - output: - chromosome_info = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt"), - - chromosomes_names = os.path.join( - config["output_dir"], - "STAR_index", - "chrName.txt") - - params: - cluster_log_path = config["cluster_log"], - output_dir = os.path.join( - config["output_dir"], - "STAR_index"), - outFileNamePrefix = os.path.join( - config["output_dir"], - "STAR_index/STAR_"), - sjdbOverhang = config["sjdbOverhang"] - - singularity: - "docker://zavolab/star:2.6.0a" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "preprocessing", - "STAR_index.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "preprocessing", - "STAR_index.stderr.log" - ), - - shell: - "(mkdir -p {params.output_dir}; \ - chmod -R 777 {params.output_dir}; \ - STAR \ - --runMode genomeGenerate \ - --sjdbOverhang {params.sjdbOverhang} \ - --genomeDir {params.output_dir} \ - --genomeFastaFiles {input.genome} \ - --runThreadN {threads} \ - --outFileNamePrefix {params.outFileNamePrefix} \ - --sjdbGTFfile {input.gtf}) 1> {log.stdout} 2> {log.stderr}" - - -rule create_csv_from_ensembl_gtf: - """ - Create a csv based on the gtf file of ensembl - """ - input: - gtf = config["gtf"] - - output: - output = os.path.join( - config["output_dir"], - "ensembl_csv", - "ensembl.csv"), - - flag = os.path.join( - config["output_dir"], - "ensembl_csv", - "done"), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_ensembl__gtf_to_csv.py" - ), - - singularity: - "docker://zavolab/rcrunch_python:1.0" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "preprocessing", - "ensembl_csv.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "preprocessing", - "ensembl_csv.stderr.log" - ), - - shell: - "(python {params.script} \ - --gtf_file {input.gtf} \ - --flag {output.flag} \ - --out_file {output.output}) 1> {log.stdout} 2> {log.stderr}" -rule cutadapt: +rule cutadapt_se: """ Trim 3' and 5' adapters """ @@ -124,8 +17,8 @@ rule cutadapt: output: reads = os.path.join( config["output_dir"], - "cutadapt_replicates", - "{sample}.cutadapt_standard.fastq.gz") + "cutadapt", + "{sample}_mate1.se.cutadapt_standard.fastq.gz") params: cluster_log_path = config["cluster_log"], @@ -139,16 +32,16 @@ rule cutadapt: stdout = os.path.join( config["local_log"], "preprocessing", - "cutadapt__{sample}.stdout.log" + "cutadapt__{sample}.se.stdout.log" ), stderr = os.path.join( config["local_log"], "preprocessing", - "cutadapt__{sample}.stderr.log" + "cutadapt__{sample}.se.stderr.log" ) shell: - "cutadapt \ + "(cutadapt \ -e 0.1 \ -O 1 \ -j {threads} \ @@ -160,21 +53,21 @@ rule cutadapt: {input.reads}) 1> {log.stdout} 2> {log.stderr}" -rule umi_tools_format: +rule umi_tools_format_se: """ Create a UMI-tools barcode collapse format for any ENCODE samples. """ input: reads = os.path.join( config["output_dir"], - "cutadapt_replicates2", - "{sample}.cutadapt_standard.fastq.gz") + "cutadapt", + "{sample}_mate1.se.cutadapt_standard.fastq.gz") output: reads = os.path.join( config["output_dir"], - "cutadapt_replicates2", - "{sample}.cutadapt_encode.fastq.gz") + "cutadapt", + "{sample}_mate1.se.cutadapt_encode.fastq.gz") params: cluster_log_path = config["cluster_log"], script = os.path.join( @@ -186,7 +79,7 @@ rule umi_tools_format: config[wildcards.sample]['format'] singularity: - "docker://zavolab/rcrunch_python:1.0" + "docker://zavolab/rcrunch_python:1.0.5" threads: 1 @@ -194,12 +87,12 @@ rule umi_tools_format: stdout = os.path.join( config["local_log"], "preprocessing", - "umi_format__{sample}.stdout.log" + "umi_format__{sample}.se.stdout.log" ), stderr = os.path.join( config["local_log"], "preprocessing", - "umi_format__{sample}.stderr.log" + "umi_format__{sample}.se.stderr.log" ), shell: "(python {params.script} \ diff --git a/workflow/rules/rcrunch_transcriptomic.smk b/workflow/rules/rcrunch_transcriptomic.smk new file mode 100644 index 0000000..b153535 --- /dev/null +++ b/workflow/rules/rcrunch_transcriptomic.smk @@ -0,0 +1,2349 @@ +# -------------------------------------------------------------------------------- +# RCRUNCH +# Author : Katsantoni Maria +# Company: Mihaela Zavolan group, Biozentrum, Basel +# -------------------------------------------------------------------------------- +# Transcriptomic RCRUNCH subpipeline for CLIP data +# This approach is based on the hypothesis that the RBP is binding the mature +# mRNA and thus the sequence of the mature mRNA is important for the binding. +# ________________________________________________________________________________ +import os + +rule TR_initial_index: + """ + Index genome bamfile using samtools. + """ + input: + bam = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}", + "{sample}.{mates}.bam"), + sample=wildcards.sample, + mates=get_mates(wildcards.sample)), + log = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}", + "{sample}.{mates}.Log.final.out"), + sample=wildcards.sample, + mates=get_mates(wildcards.sample)) + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}.sorted.bam") + ), + bai = temp( + os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}.sorted.bam.bai") + ), + + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}_temp"), + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_initial_index.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_initial_index.stderr.log"), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai}) \ + 1> {log.stdout} 2> {log.stderr}" + + +rule TR_flag_ncRNA_reads: + """ + Make txt file of reads mapping to ncRNAs. + If a multimapper maps to a ncRNA, all reads are excluded. + Custom script. + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}.sorted.bam"), + bai = os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}.sorted.bam.bai"), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_filter_ncRNAs.py"), + ncRNAs = config['ncRNAs'], + ncRNA_biotypes = expand(config['ncRNA_biotypes']), + paired = lambda wildcards: + config[wildcards.sample]['paired'], + sense = lambda wildcards: + config[wildcards.sample]['sense'], + + output: + outfile = temp( + os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.ncrna.txt") + ), + flag = temp( + os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}_done") + ), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_flag_ncRNA_reads.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_flag_ncRNA_reads.stderr.log"), + + shell: + "(python {params.script} \ + --bamfile {input.bam} \ + --RNA_central {params.ncRNAs} \ + --outfile {output.outfile} \ + --paired {params.paired} \ + --sense {params.sense} \ + --ncRNAs {params.ncRNA_biotypes} \ + --flag {output.flag} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_remove_ncRNA_reads: + """ + Remove the reads mapping to rRNAs using picard. + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}.sorted.bam"), + read_names = os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.ncrna.txt"), + + output: + reads = temp( + os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam") + ), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/picard:2.18.9" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_remove_ncRNA_reads.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_remove_ncRNA_reads.stderr.log"), + + shell: + "(java -jar /usr/local/bin/picard.jar FilterSamReads \ + I={input.bam} \ + O={output.reads} \ + READ_LIST_FILE={input.read_names} \ + FILTER=excludeReadList) \ + 1> {log.stdout} 2> {log.stderr}" + + +rule TR_index_ncRNA_rm: + """ + Index bamfile without the rRNA-mapped reads using samtools. + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam"), + + output: + bai = temp( + os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam.bai") + ), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_index_ncRNA_rm.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_index_ncRNA_rm.stderr.log"), + + shell: + "(samtools index \ + {input.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_flag_duplicates: + """ + Flag duplicate reads using the STAR flags. + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam"), + bai = os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam.bai"), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "flag_duplicates", + "{sample}.duplicates.Processed.out.bam") + ), + + params: + cluster_log_path = config["cluster_log"], + outFileNamePrefix = os.path.join( + config["output_dir"], + "TR", + "flag_duplicates", + "{sample}.duplicates."), + + singularity: + "docker://zavolab/star:2.6.0a" + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_flag_duplicates.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_flag_duplicates.stderr.log"), + + shell: + "(STAR \ + --inputBAMfile {input.bam} \ + --bamRemoveDuplicatesType UniqueIdenticalNotMulti \ + --runMode inputAlignmentsFromBAM \ + --outFileNamePrefix {params.outFileNamePrefix} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_remove_duplicates: + """ + Remove reads flagged as duplicates by STAR. + Custom script. + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "flag_duplicates", + "{sample}.duplicates.Processed.out.bam"), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "remove_duplicates", + "{sample}.duplicates.bam") + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_filter_duplicates.py"), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_remove_duplicates.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_remove_duplicates.stderr.log"), + + shell: + "(python {params.script} \ + --bamfile {input.bam} \ + --outfile {output.bam} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_no_duplicate_removal: + """ + No removal of duplicates + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam"), + bai = os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam.bai"), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "remove_duplicates", + "{sample}.with_duplicates.bam") + ), + bai = temp( + os.path.join( + config["output_dir"], + "TR", + "remove_duplicates", + "{sample}.with_duplicates.bam.bai") + ), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://bash:5.0.16" + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_no_duplicate_removal.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_no_duplicate_removal.stderr.log"), + + shell: + "(cp {input.bam} {output.bam}; \ + cp {input.bai} {output.bai}; \ + ) 1> {log.stdout} 2> {log.stderr}" + +rule TR_merge: + input: + bam = lambda wildcards: + expand( + os.path.join( + config["output_dir"], + "TR", + "remove_duplicates", + "{sample}.{dup_type}.{mates}.bam"), + sample=config[wildcards.experiment][wildcards.name], + dup_type=config[config[wildcards.experiment][wildcards.name][0]]['dup_type'], + mates=get_mates(config[wildcards.experiment][wildcards.name][0])), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.bam") + ), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_merge.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_merge.stderr.log"), + + shell: + "(samtools merge \ + {output.bam} {input.bam}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_sort_merged: + input: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.bam" + ), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam") + ), + bai = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam.bai") + ), + + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{name}_merge_samples"), + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_sort_merged.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_sort_merged.stderr.log"), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_sortname_merged: + input: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.bam"), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.sortedname.bam") + ), + + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{name}_merge_samples_sortname"), + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_sortname_merged.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_sortname_merged.stderr.log"), + + shell: + "(samtools sort -n \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_read_frequencies: + """ + Calculate occurence of reads that are multimappers. + Custom script. + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam"), + bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam.bai"), + + output: + frequencies = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.frequencies.csv") + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_bam_get_read_frequencies.py"), + paired = lambda wildcards: + config[config[wildcards.experiment][wildcards.name][0]]['paired'], + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_read_frequencies.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_read_frequencies.stderr.log"), + + shell: + "(python {params.script} \ + --bamfile {input.bam} \ + --paired {params.paired} \ + --outfile {output.frequencies}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + + +rule TR_salmon_index: + """ + Build salmon index based on transcript annotation. + Required for transcript quantification done by salmon. + """ + input: + transcriptome = config["transcriptome"] + + output: + index = directory(os.path.join( + config["output_dir"], + "TR_salmon_index") + ), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/salmon:0.11.0" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "salmon_index.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "salmon_index.stderr.log"), + + shell: + "(salmon index \ + -t {input.transcriptome} \ + -i {output.index} \ + --type fmd; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_customised_transcriptome: + """ + Build customised transcriptome keeping + the most expressed isoform per gene. + Custom script. + """ + input: + transcriptome = config["transcriptome"], + ensembl_csv = os.path.join( + config["output_dir"], + "ensembl_csv", + "ensembl.csv"), + ensembl_gtf = config["gtf"], + quantification = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_quantification_{mates}_{name}", + "quant.sf"), + experiment=wildcards.experiment, + name=wildcards.name, + mates=get_mates(config[wildcards.experiment][wildcards.name][0])), + output: + output_gtf = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.transcriptome.gtf"), + output_fasta = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.transcriptome.fa"), + info = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.info.csv"), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_most_expressed_transcriptome.py"), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_customised_transcriptome.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_customised_transcriptome.stderr.log"), + + shell: + "(python {params.script} \ + --quantification_file {input.quantification} \ + --transcriptome {input.transcriptome} \ + --ensembl_csv {input.ensembl_csv} \ + --ensembl_gtf {input.ensembl_gtf} \ + --out_gtf {output.output_gtf} \ + --transcript_info {output.info} \ + --out_fasta {output.output_fasta} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_transcriptome_index: + """ + Create index of the customised transcriptome using STAR. + """ + input: + tr_fasta = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.transcriptome.fa"), + tr_gtf = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.transcriptome.gtf"), + + output: + output = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{name}_STAR_index", + "chrNameLength.txt"), + + params: + cluster_log_path = config["cluster_log"], + output_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{name}_STAR_index"), + outFileNamePrefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{name}_STAR_index/STAR_"), + sjdbOverhang = config["sjdbOverhang"] + + singularity: + "docker://zavolab/star:2.6.0a" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_transcriptome_index.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_transcriptome_index.stderr.log"), + + shell: + "(chmod -R 777 {params.output_dir}; \ + STAR \ + --runMode genomeGenerate \ + --genomeSAindexNbases 12 \ + --genomeChrBinNbits 12 \ + --genomeDir {params.output_dir} \ + --genomeFastaFiles {input.tr_fasta} \ + --runThreadN {threads} \ + --outFileNamePrefix {params.outFileNamePrefix} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_index_map_to_transcriptome: + """ + Index the transcriptomic alignment. + """ + input: + bam = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map", + "{name}", + "transcriptome.{mates}.bam" + ), + experiment=wildcards.experiment, + name=wildcards.name, + mates=get_mates(config[wildcards.experiment][wildcards.name][0])), + log = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map", + "{name}", + "transcriptome.{mates}.Log.final.out" + ), + experiment=wildcards.experiment, + name=wildcards.name, + mates=get_mates(config[wildcards.experiment][wildcards.name][0])), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map_sort", + "{name}", + "transcriptome.sorted.bam") + ), + bai = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map_sort", + "{name}", + "transcriptome.sorted.bam.bai") + ), + + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map", + "{name}", + "transcriptome_temp"), + + singularity: + "docker://zavolab/samtools:1.8" + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_index_map_to_transcriptome.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_index_map_to_transcriptome.stderr.log"), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_index_map_to_genome: + """ + Index the genomic alignment. + """ + input: + bam = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map", + "{name}", + "genome.{mates}.bam"), + experiment=wildcards.experiment, + name=wildcards.name, + mates=get_mates(config[wildcards.experiment][wildcards.name][0])), + log = lambda wildcards: expand( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map", + "{name}", + "genome.{mates}.Log.final.out"), + experiment=wildcards.experiment, + name=wildcards.name, + mates=get_mates(config[wildcards.experiment][wildcards.name][0])), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map_sort", + "{name}", + "genome.sorted.bam") + ), + bai = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map_sort", + "{name}", + "genome.sorted.bam.bai") + ), + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map_sort", + "{name}", + "genome_temp"), + + singularity: + "docker://zavolab/samtools:1.8" + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_index_map_to_genome.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_index_map_to_genome.stderr.log"), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_assign_to_gn_tr: + """ + Remove reads that map to '-' strand in transcriptome + """ + input: + ensembl_csv = os.path.join( + config["output_dir"], + "ensembl_csv", + "ensembl.csv"), + chromosome_info = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt"), + gn_bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map_sort", + "{name}", + "genome.sorted.bam"), + gn_bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map_sort", + "{name}", + "genome.sorted.bam.bai"), + tr_bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map_sort", + "{name}", + "transcriptome.sorted.bam"), + tr_bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map_sort", + "{name}", + "transcriptome.sorted.bam.bai"), + + output: + tr_bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_transcriptome.bam") + ), + gn_bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_genome.bam") + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_assign_read_to_gn_tr.py"), + out_folder = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{name}_assign_to_gn_tr_temp"), + paired = lambda wildcards: + config[config[wildcards.experiment][wildcards.name][0]]['paired'], + sense = lambda wildcards: + config[config[wildcards.experiment][wildcards.name][0]]['sense'], + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_assign_to_gn_tr.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_assign_to_gn_tr.stderr.log"), + + shell: + "(mkdir -p {params.out_folder} ;\ + python {params.script} \ + --annotation {input.ensembl_csv} \ + --tr_bam {input.tr_bam} \ + --gn_bam {input.gn_bam} \ + --out_folder {params.out_folder} \ + --filtered_tr_bam {output.tr_bam} \ + --filtered_gn_bam {output.gn_bam} \ + --paired {params.paired} \ + --sense {params.sense} \ + --chromosome_info {input.chromosome_info} \ + --threads {threads} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_transcriptome_filter_sort: + """ + Sort and index transcriptomic reads after + filtering wrong strand reads and + those that map better to genome + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_transcriptome.bam"), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_transcriptome.sorted.bam") + ), + bai = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_transcriptome.sorted.bam.bai") + ), + + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{name}_sort_samples"), + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_transcriptome_filter_sort.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_transcriptome_filter_sort.stderr.log"), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_genome_filter_sort: + """ + Sort and index genomic reads after filtering wrong strand reads and + those that map better to genome + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_genome.bam"), + + output: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_genome.sorted.bam"), + bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_genome.sorted.bam.bai"), + + params: + cluster_log_path = config["cluster_log"], + prefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{name}_gn_sort_samples"), + + singularity: + "docker://zavolab/samtools:1.8" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_genome_filter_sort.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_genome_filter_sort.stderr.log"), + + shell: + "(samtools sort \ + -T {params.prefix} \ + -@ {threads} \ + {input.bam} > {output.bam}; \ + samtools index \ + {output.bam} {output.bai} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_bam_to_bed: + """ + Convert bamfile to bedfile using bedtools + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_genome.sorted.bam"), + + output: + bed = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_genome.bed") + ), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/bedtools:2.27.0" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_bam_to_bed.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_bam_to_bed.stderr.log"), + + shell: + "(bedtools bamtobed \ + -i {input.bam} > {output.bed}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_bam_to_bed_transcriptome: + """ + Convert bamfile to bedfile using bedtools + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_transcriptome.sorted.bam"), + + output: + bed = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_transcriptome.bed") + ), + + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/bedtools:2.27.0" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_bam_to_bed_transcriptome.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_bam_to_bed_transcriptome.stderr.log"), + + shell: + "(bedtools bamtobed \ + -i {input.bam} > {output.bed}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_genome_coverage: + """ + Count number of fragments in windows + of specific size at the genome. + Custom script. + """ + input: + chromosome_info = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt"), + fg_bed = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates_genome.bed"), + bg_bed = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis_genome.bed"), + fg_frequencies = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates.frequencies.csv"), + bg_frequencies = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis.frequencies.csv"), + + output: + coverage = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}.genome_coverage.bed") + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "sliding_windows.py"), + output_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}"), + window_f = lambda wildcards: + config[wildcards.experiment]["window_f"], + window_b = lambda wildcards: + config[wildcards.experiment]["window_b"], + step_size = lambda wildcards: + config[wildcards.experiment]["step_size"], + sense_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['sense'], + sense_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['sense'], + paired_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['paired'], + paired_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['paired'], + prefix = "{experiment}.genome_coverage", + background_type = lambda wildcards: + config[wildcards.experiment]["background_type"], + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "genome_coverage.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "genome_coverage.stderr.log"), + + shell: + "(python {params.script} \ + --bed_foreground {input.fg_bed} \ + --bed_background {input.bg_bed} \ + --foreground_frequencies {input.fg_frequencies} \ + --background_frequencies {input.bg_frequencies} \ + --out {params.output_dir} \ + --prefix {params.prefix} \ + --chromosomes {input.chromosome_info} \ + --sense_f {params.sense_f} \ + --sense_b {params.sense_b} \ + --paired_f {params.paired_f} \ + --paired_b {params.paired_b} \ + --window_f {params.window_f} \ + --window_b {params.window_b} \ + --step_size {params.step_size} \ + --data_type genome \ + --background {params.background_type} \ + --cutoff 1 \ + --threads {threads}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_transcriptome_coverage: + """ + Count number of fragments in windows + of specific size at + the customised transcriptome. + Custom script. + """ + input: + chromosome_info = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index", + "chrNameLength.txt"), + fg_bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates_transcriptome.sorted.bam"), + fg_bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates_transcriptome.sorted.bam.bai"), + bg_bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis_transcriptome.sorted.bam"), + bg_bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis_transcriptome.sorted.bam.bai"), + fg_frequencies = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates.frequencies.csv"), + bg_frequencies = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis.frequencies.csv"), + + output: + coverage = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}.transcriptome_coverage.bed") + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_bam_count_sliding_windows.py"), + output_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}"), + window_f = lambda wildcards: + config[wildcards.experiment]["window_f"], + window_b = lambda wildcards: + config[wildcards.experiment]["window_b"], + step_size = lambda wildcards: + config[wildcards.experiment]["step_size"], + sense_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['sense'], + sense_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['sense'], + paired_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['paired'], + paired_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['paired'], + prefix = "{experiment}.transcriptome_coverage", + background_type = lambda wildcards: + config[wildcards.experiment]["background_type"] + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "transcriptome_coverage.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "transcriptome_coverage.stderr.log"), + + shell: + "(python {params.script} \ + --bam_foreground {input.fg_bam} \ + --bam_background {input.bg_bam} \ + --bam_foreground_frequencies {input.fg_frequencies} \ + --bam_background_frequencies {input.bg_frequencies} \ + --out {params.output_dir} \ + --prefix {params.prefix} \ + --chromosomes {input.chromosome_info} \ + --sense_f {params.sense_f} \ + --sense_b {params.sense_b} \ + --paired_f {params.paired_f} \ + --paired_b {params.paired_b} \ + --window_f {params.window_f} \ + --window_b {params.window_b} \ + --step_size {params.step_size} \ + --data_type transcriptome \ + --background {params.background_type} \ + --cutoff 1 \ + --threads {threads}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_merge_windows: + """ + Merge windows from foreground/ background. + Custom script. + """ + input: + tr_coverage = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}.transcriptome_coverage.bed"), + gn_coverage = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}.genome_coverage.bed"), + + output: + merged_coverage = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}.total_coverage.bed") + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_merge_fg_bg.py"), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "merge_windows.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "merge_windows.stderr.log"), + + shell: + "(python {params.script} \ + --transcriptome {input.tr_coverage} \ + --genome {input.gn_coverage} \ + --out {output.merged_coverage} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_enriched_regions: + """ + Obtain enriched regions for binding based on EM using + bg for noise estimation. Modified and adapted from: + Berger, Severin M., et al. "Crunch: Integrated processing and modeling of + ChIP-seq data in terms of regulatory motifs." + Genome research (2019): gr-239319 + Custom script. + """ + input: + coverage = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}.total_coverage.bed"), + + output: + enriched_regions = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_enriched_regions.csv"), + parameters = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_parameters.csv"), + all_regions = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_regions.csv"), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_find_enriched_regions.py"), + plot_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "enriched_regions_plots"), + fdr_cutoff = config['FDR'] + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "enriched_regions.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "enriched_regions.stderr.log"), + + shell: + "(mkdir -p {params.plot_dir}; \ + python {params.script} \ + --windows_table {input.coverage} \ + --out {output.enriched_regions} \ + --all_regions {output.all_regions} \ + --out_parameters {output.parameters} \ + --out_folder {params.plot_dir} \ + --fdr_cutoff {params.fdr_cutoff} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_fit_peaks: + """ + Fit individual binding events in the enriched regions. + Modifed and adapted from: + Berger, Severin M., et al. "Crunch: Integrated processing and + modeling of ChIP-seq data in terms of regulatory motifs." + Genome research (2019): gr-239319. + Custom script. + """ + input: + fg_gn_bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates_genome.sorted.bam"), + fg_gn_bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates_genome.sorted.bam.bai"), + bg_gn_bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis_genome.sorted.bam"), + bg_gn_bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis_genome.sorted.bam.bai"), + fg_tr_bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates_transcriptome.sorted.bam"), + fg_tr_bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates_transcriptome.sorted.bam.bai"), + bg_tr_bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis_transcriptome.sorted.bam"), + bg_tr_bai = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis_transcriptome.sorted.bam.bai"), + fg_frequencies = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates.frequencies.csv"), + bg_frequencies = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_smis.frequencies.csv"), + enriched_regions = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_enriched_regions.csv"), + parameters = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_parameters.csv"), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt"), + chromosomes_t = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index", + "chrNameLength.txt"), + + output: + peaks = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_peaks.csv"), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_define_peaks.py"), + plot_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "fit_peaks_plots"), + fragment_size = config["fragment_size"], + window_size = lambda wildcards: + config[wildcards.experiment]['window_f'], + paired_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['paired'], + paired_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['paired'], + sense_f = lambda wildcards: + config[config[wildcards.experiment]["replicates"][0]]['sense'], + sense_b = lambda wildcards: + config[config[wildcards.experiment]["smis"][0]]['sense'], + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "fit_peaks.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "fit_peaks.stderr.log"), + + shell: + "(mkdir -p {params.plot_dir}; \ + python {params.script} \ + --enriched_regions {input.enriched_regions} \ + --parameters {input.parameters} \ + --fragment_size {params.fragment_size} \ + --window_size {params.window_size} \ + --chromosomes_g {input.chromosomes_g} \ + --bam_foreground_g {input.fg_gn_bam} \ + --bam_background_g {input.bg_gn_bam} \ + --chromosomes_t {input.chromosomes_t} \ + --bam_foreground_t {input.fg_tr_bam} \ + --bam_background_t {input.bg_tr_bam} \ + --bam_fg_fq {input.fg_frequencies} \ + --bam_bg_fq {input.bg_frequencies} \ + --paired_f {params.paired_f} \ + --paired_b {params.paired_b} \ + --sense_f {params.sense_f} \ + --sense_b {params.sense_b} \ + --out {output.peaks} \ + --out_folder {params.plot_dir} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_peaks_split_sets_crosslink: + """ + Split the significant peaks into training and test + for subsequent motif analysis + """ + input: + genome_fasta = config["genome"], + transcriptome_fasta = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates.transcriptome.fa"), + all_regions = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_regions.csv"), + reads = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_peaks.csv"), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt"), + chromosomes_t = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index", + "chrNameLength.txt"), + + output: + training = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "training.fasta") + ), + test = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "test.fasta") + ), + training_bg = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "training_bg.fasta") + ), + test_bg = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "test_bg.fasta") + ), + dirtemp = temp( + directory( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink", + "tmpdir") + ) + ), + + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_peaks_split_sets.py"), + output_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "crosslink"), + genome_tag = config["genome_tag"], + peak_size = 50 + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}_peaks_split_sets_crosslink.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}_peaks_split_sets_crosslink.stderr.log"), + + shell: + "(mkdir -p {output.dirtemp}; \ + python {params.script} \ + --peaks_file {input.reads} \ + --genome_fasta {input.genome_fasta} \ + --all_regions {input.all_regions} \ + --genome_tag {params.genome_tag} \ + --tempdir {output.dirtemp} \ + --transcriptome_fasta {input.transcriptome_fasta} \ + --chromosomes_g {input.chromosomes_g} \ + --chromosomes_t {input.chromosomes_t} \ + --crosslink_type 'crosslink' \ + --peak_size {params.peak_size} \ + --out_folder {params.output_dir} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_peaks_split_sets: + """ + Split the significant peaks into training and test + for subsequent motif analysis + """ + input: + genome_fasta = config["genome"], + transcriptome_fasta = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates.transcriptome.fa"), + all_regions = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_regions.csv"), + reads = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_peaks.csv"), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt"), + chromosomes_t = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index", + "chrNameLength.txt"), + + output: + training = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "training.fasta") + ), + test = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "test.fasta") + ), + training_bg = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "training_bg.fasta") + ), + test_bg = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "test_bg.fasta") + ), + dirtemp = temp( + directory( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center", + "tmpdir") + ) + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_peaks_split_sets.py"), + output_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}", + "peak_center"), + genome_tag = config["genome_tag"], + peak_size = 50 + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}_peaks_split_sets_peak_center.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "motif_analysis_crossvalidation", + "{run}_peaks_split_sets_peak_center.stderr.log"), + + shell: + "(mkdir -p {output.dirtemp}; \ + python {params.script} \ + --peaks_file {input.reads} \ + --genome_fasta {input.genome_fasta} \ + --all_regions {input.all_regions} \ + --tempdir {output.dirtemp} \ + --genome_tag {params.genome_tag} \ + --transcriptome_fasta {input.transcriptome_fasta} \ + --chromosomes_g {input.chromosomes_g} \ + --chromosomes_t {input.chromosomes_t} \ + --crosslink_type 'peak_center' \ + --out_folder {params.output_dir} \ + ) 1> {log.stdout} 2> {log.stderr}" + +rule TR_peaks_split_sets_crosslink_all_motifs: + """ + Get all peaks crosslink + """ + input: + transcriptome_fasta = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates.transcriptome.fa"), + genome_fasta = config["genome"], + all_regions = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_regions.csv" + ), + reads = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_peaks.csv" + ), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + chromosomes_t = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index", + "chrNameLength.txt"), + + output: + test = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_final", + "crosslink", + "test.fasta") + ), + test_bg = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_final", + "crosslink", + "test_bg.fasta") + ), + dirtemp = temp( + directory( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_final", + "crosslink", + "tmpdir") + ) + ), + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "total_enrichment_peakstofasta.py" + ), + output_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_final", + "crosslink", + ), + genome_tag = config["genome_tag"], + peak_size = 50, + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "motif_analysis_final", + "peaks_split_sets__all_motifs_crosslink.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "motif_analysis_final", + "peaks_split_sets__all_motifs_crosslink.stderr.log" + ), + + shell: + "(mkdir -p {params.output_dir}; \ + mkdir -p {output.dirtemp}; \ + python {params.script} \ + --peaks_file {input.reads} \ + --genome_fasta {input.genome_fasta} \ + --all_regions {input.all_regions} \ + --genome_tag {params.genome_tag} \ + --tempdir {output.dirtemp} \ + --transcriptome_fasta {input.transcriptome_fasta} \ + --chromosomes_g {input.chromosomes_g} \ + --chromosomes_t {input.chromosomes_t} \ + --peak_size {params.peak_size} \ + --out_folder {params.output_dir} \ + --crosslink_type 'crosslink' \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_peaks_split_sets_all_motifs: + """ + get all peaks crosslink + """ + input: + transcriptome_fasta = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_replicates.transcriptome.fa"), + genome_fasta = config["genome"], + all_regions = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_regions.csv" + ), + reads = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_total_peaks.csv" + ), + chromosomes_g = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + chromosomes_t = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index", + "chrNameLength.txt"), + + + output: + test = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_final", + "peak_center", + "test.fasta") + ), + test_bg = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_final", + "peak_center", + "test_bg.fasta") + ), + dirtemp = temp( + directory( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_final", + "peak_center", + "tmpdir") + ) + ), + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "total_enrichment_peakstofasta.py" + ), + output_dir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "motif_analysis_final", + "peak_center", + ), + genome_tag = config["genome_tag"], + peak_size = 50 + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "motif_analysis_final", + "peaks_split_sets__all_motifs_standard.stdout.log" + ), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "motif_analysis_final", + "peaks_split_sets__all_motifs_standard.stderr.log" + ), + + shell: + "(mkdir -p {params.output_dir}; \ + mkdir -p {output.dirtemp}; \ + python {params.script} \ + --peaks_file {input.reads} \ + --genome_fasta {input.genome_fasta} \ + --all_regions {input.all_regions} \ + --genome_tag {params.genome_tag} \ + --tempdir {output.dirtemp} \ + --transcriptome_fasta {input.transcriptome_fasta} \ + --chromosomes_g {input.chromosomes_g} \ + --chromosomes_t {input.chromosomes_t} \ + --peak_size {params.peak_size} \ + --out_folder {params.output_dir} \ + --crosslink_type 'peak_center' \ + ) 1> {log.stdout} 2> {log.stderr}" + diff --git a/workflow/rules/rcrunch_transcriptomic_pe.smk b/workflow/rules/rcrunch_transcriptomic_pe.smk index 92fedef..9208e35 100644 --- a/workflow/rules/rcrunch_transcriptomic_pe.smk +++ b/workflow/rules/rcrunch_transcriptomic_pe.smk @@ -9,7 +9,7 @@ # ________________________________________________________________________________ import os -rule TR_initial_map_to_genome: +rule TR_initial_map_to_genome_pe: """ Initial mapping to the genome using STAR. """ @@ -24,18 +24,18 @@ rule TR_initial_map_to_genome: os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate1.cutadapt_{format}.fastq.gz"), + "{sample}_mate1.pe.cutadapt_{format}.fastq.gz"), sample=wildcards.sample, - format=config[wildcards.sample]['format'] + format=config[wildcards.sample]['format'], ), reads2 = lambda wildcards: expand( os.path.join( config["output_dir"], "cutadapt", - "{sample}_mate2.cutadapt_{format}.fastq.gz"), + "{sample}_mate2.pe.cutadapt_{format}.fastq.gz"), sample=wildcards.sample, - format=config[wildcards.sample]['format'] + format=config[wildcards.sample]['format'], ), output: @@ -45,25 +45,14 @@ rule TR_initial_map_to_genome: "TR", "initial_map", "{sample}", - "{sample}.bam") - ), + "{sample}.pe.bam")), logfile = temp( os.path.join( config["output_dir"], "TR", "initial_map", "{sample}", - "{sample}Log.final.out") - ), - outdir = temp( - directory( - os.path.join( - config["output_dir"], - "TR", - "initial_map", - "{sample}") - ) - ), + "{sample}.pe.Log.final.out")), params: cluster_log_path = config["cluster_log"], @@ -76,9 +65,11 @@ rule TR_initial_map_to_genome: "TR", "initial_map", "{sample}", - "{sample}"), + "{sample}.pe."), multimappers = config["multimappers"] + shadow: "full" + singularity: "docker://zavolab/star:2.6.0a" @@ -89,12 +80,12 @@ rule TR_initial_map_to_genome: config["local_log"], "TR", "preprocessing", - "{sample}_initial_map_to_genome.stdout.log"), + "{sample}_initial_map_to_genome.pe.stdout.log"), stderr = os.path.join( config["local_log"], "TR", "preprocessing", - "{sample}_initial_map_to_genome.stderr.log"), + "{sample}_initial_map_to_genome.pe.stderr.log"), shell: "(STAR \ @@ -120,178 +111,163 @@ rule TR_initial_map_to_genome: ) 1> {log.stdout} 2> {log.stderr}" -rule TR_initial_index: + + +rule TR_umi_collapse_pe: """ - Index genome bamfile using samtools. + Alternative duplicate removal when UMIs are present using umitools. """ input: - outdir = os.path.join( + bam = os.path.join( config["output_dir"], "TR", - "initial_map", - "{sample}"), - bam = os.path.join( + "remove_ncRNAs", + "{sample}.filtered.bam"), + bai = os.path.join( config["output_dir"], "TR", - "initial_map", - "{sample}", - "{sample}.bam"), + "remove_ncRNAs", + "{sample}.filtered.bam.bai"), output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "initial_map", - "{sample}.sorted.bam") - ), - bai = temp( - os.path.join( - config["output_dir"], - "TR", - "initial_map", - "{sample}.sorted.bam.bai") - ), - - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( + bam = temp(os.path.join( config["output_dir"], "TR", - "initial_map", - "{sample}_temp"), + "remove_duplicates", + "{sample}.umis.pe.bam") + ), - singularity: - "docker://zavolab/samtools:1.8" + params: + cluster_log_path = config["cluster_log"] - threads: 1 + singularity: + "docker://zavolab/umi-tools:0.5.4" log: - stdout = os.path.join( + stdout = os.path.join( config["local_log"], "TR", "preprocessing", - "{sample}_initial_index.stdout.log"), + "{sample}_umi_collapse.pe.stdout.log"), stderr = os.path.join( config["local_log"], "TR", "preprocessing", - "{sample}_initial_index.stderr.log"), + "{sample}_umi_collapse.pe.stderr.log"), shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai}) \ - 1> {log.stdout} 2> {log.stderr}" + "(umi_tools \ + dedup \ + -I {input.bam} \ + --paired \ + -S {output.bam} \ + ) 1> {log.stdout} 2> {log.stderr}" -rule TR_flag_ncRNA_reads: +rule TR_bam_to_fastq_pe: """ - Make txt file of reads mapping to ncRNAs. - If a multimapper maps to a ncRNA, all reads are excluded. - Custom script. + Convert initially mapped reads back to fastq files to do mappings at the + transcriptome and genome level, using bedtools. """ input: bam = os.path.join( config["output_dir"], "TR", - "initial_map", - "{sample}.sorted.bam"), - bai = os.path.join( + "{experiment}", + "{experiment}_{name}_merge_samples.sortedname.bam"), + bam_coord = os.path.join( config["output_dir"], "TR", - "initial_map", - "{sample}.sorted.bam.bai"), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_filter_ncRNAs.py"), - ncRNAs = config['ncRNAs'], - ncRNA_biotypes = expand(config['ncRNA_biotypes']), - paired = lambda wildcards: - config[wildcards.sample]['paired'], - sense = lambda wildcards: - config[wildcards.sample]['sense'], + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam"), output: - outfile = temp( + fastq_mate1 = temp( os.path.join( config["output_dir"], "TR", - "remove_ncRNAs", - "{sample}.ncrna.txt") + "{experiment}", + "{experiment}_{name}.mate1.pe.unfiltered.fastq") ), - flag = temp( + fastq_mate2 = temp( os.path.join( config["output_dir"], "TR", - "remove_ncRNAs", - "{sample}_done") + "{experiment}", + "{experiment}_{name}.mate2.pe.unfiltered.fastq") ), + params: + cluster_log_path = config["cluster_log"], + singularity: - "docker://zavolab/rcrunch_python:1.0.5" + "docker://zavolab/bedtools:2.27.0" + + threads: 1 log: stdout = os.path.join( config["local_log"], "TR", - "preprocessing", - "{sample}_flag_ncRNA_reads.stdout.log"), + "{experiment}", + "{name}_bam_to_fastq.pe.stdout.log"), stderr = os.path.join( config["local_log"], "TR", - "preprocessing", - "{sample}_flag_ncRNA_reads.stderr.log"), + "{experiment}", + "{name}_bam_to_fastq.pe.stderr.log"), shell: - "(python {params.script} \ - --bamfile {input.bam} \ - --RNA_central {params.ncRNAs} \ - --outfile {output.outfile} \ - --paired {params.paired} \ - --sense {params.sense} \ - --ncRNAs {params.ncRNA_biotypes} \ - --flag {output.flag} \ + "(bedtools bamtofastq \ + -i {input.bam} \ + -fq {output.fastq_mate1} \ + -fq2 {output.fastq_mate2}; \ ) 1> {log.stdout} 2> {log.stderr}" -rule TR_remove_ncRNA_reads: +rule TR_bam_to_fastq_filtered_pe: """ - Remove the reads mapping to rRNAs using picard. + Remove duplicate reads after converting bam to fastq + (when there are multimappers the reads will appear + multiple times in the fastq). """ input: - bam = os.path.join( + mate1 = os.path.join( config["output_dir"], "TR", - "initial_map", - "{sample}.sorted.bam"), - read_names = os.path.join( + "{experiment}", + "{experiment}_{name}.mate1.pe.unfiltered.fastq"), + mate2 = os.path.join( config["output_dir"], "TR", - "remove_ncRNAs", - "{sample}.ncrna.txt"), + "{experiment}", + "{experiment}_{name}.mate2.pe.unfiltered.fastq"), output: - reads = temp( + mate1 = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate1.pe.fastq.gz") + ), + mate2 = temp( os.path.join( config["output_dir"], "TR", - "remove_ncRNAs", - "{sample}.filtered.bam") + "{experiment}", + "{experiment}_{name}.mate2.pe.fastq.gz") ), params: cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_fastq_keep_unique_reads.py"), singularity: - "docker://zavolab/picard:2.18.9" + "docker://zavolab/rcrunch_python:1.0.5" threads: 1 @@ -299,2600 +275,284 @@ rule TR_remove_ncRNA_reads: stdout = os.path.join( config["local_log"], "TR", - "preprocessing", - "{sample}_remove_ncRNA_reads.stdout.log"), + "{experiment}", + "{name}_bam_to_fastq_filtered.pe.stdout.log"), stderr = os.path.join( config["local_log"], "TR", - "preprocessing", - "{sample}_remove_ncRNA_reads.stderr.log"), + "{experiment}", + "{name}_bam_to_fastq_filtered.pe.stderr.log"), shell: - "(java -jar /usr/local/bin/picard.jar FilterSamReads \ - I={input.bam} \ - O={output.reads} \ - READ_LIST_FILE={input.read_names} \ - FILTER=excludeReadList) \ - 1> {log.stdout} 2> {log.stderr}" + "(python {params.script} \ + --fastq_in {input.mate1} \ + --fastq_out {output.mate1} \ + --fastq_in2 {input.mate2} \ + --fastq_out2 {output.mate2}; \ + ) 1> {log.stdout} 2> {log.stderr}" -rule TR_index_ncRNA_rm: +rule TR_tr_quantification_pe: """ - Index bamfile without the rRNA-mapped reads using samtools. + Quantification of transcript isoforms using Salmon. """ input: - bam = os.path.join( + transcriptome = os.path.join( + config["output_dir"], + "TR_salmon_index"), + mate1 = os.path.join( config["output_dir"], "TR", - "remove_ncRNAs", - "{sample}.filtered.bam"), + "{experiment}", + "{experiment}_{name}.mate1.pe.fastq.gz"), + mate2 = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate2.pe.fastq.gz"), output: - bai = temp( + quantification = temp( os.path.join( config["output_dir"], "TR", - "remove_ncRNAs", - "{sample}.filtered.bam.bai") + "{experiment}", + "tr_quantification_pe_{name}", + "quant.sf") ), - params: cluster_log_path = config["cluster_log"], + library = lambda wildcards: get_library_type( + config[config[wildcards.experiment][wildcards.name][0]]['mate2'], + config[config[wildcards.experiment][wildcards.name][0]]['sense']), + outdir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_quantification_pe_{name}"), + + shadow: "full" singularity: - "docker://zavolab/samtools:1.8" + "docker://zavolab/salmon:0.11.0" - threads: 1 + threads: 8 log: stdout = os.path.join( config["local_log"], "TR", - "preprocessing", - "{sample}_index_ncRNA_rm.stdout.log"), + "{experiment}", + "{name}_tr_quantification.pe.stdout.log"), stderr = os.path.join( config["local_log"], "TR", - "preprocessing", - "{sample}_index_ncRNA_rm.stderr.log"), + "{experiment}", + "{name}_tr_quantification.pe.stderr.log"), shell: - "(samtools index \ - {input.bam} {output.bai} \ + "(salmon quant \ + -i {input.transcriptome} \ + -l {params.library} \ + -1 {input.mate1} \ + -2 {input.mate2} \ + -o {params.outdir} \ + -p {threads} \ ) 1> {log.stdout} 2> {log.stderr}" -rule TR_flag_duplicates: +rule TR_map_to_transcriptome_pe: """ - Flag duplicate reads using the STAR flags. + Map to the transcriptome treating each transcript + as a "chromosome", using STAR. """ input: - bam = os.path.join( + transcriptome = os.path.join( config["output_dir"], "TR", - "remove_ncRNAs", - "{sample}.filtered.bam"), - bai = os.path.join( + "{experiment}", + "replicates_STAR_index", + "chrNameLength.txt"), + mate1 = os.path.join( config["output_dir"], "TR", - "remove_ncRNAs", - "{sample}.filtered.bam.bai"), + "{experiment}", + "{experiment}_{name}.mate1.pe.fastq.gz"), + mate2 = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate2.pe.fastq.gz"), output: bam = temp( os.path.join( config["output_dir"], "TR", - "flag_duplicates", - "{sample}.duplicates.Processed.out.bam") + "{experiment}", + "tr_map", + "{name}", + "transcriptome.pe.bam") + ), + logfile = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map", + "{name}", + "transcriptome.pe.Log.final.out") ), - params: cluster_log_path = config["cluster_log"], + transcriptome = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index"), + experiment_id = "{experiment}_{name}_transcriptome", + fragment_size = config["fragment_size"], outFileNamePrefix = os.path.join( config["output_dir"], "TR", - "flag_duplicates", - "{sample}.duplicates."), - + "{experiment}", + "tr_map", + "{name}", + "transcriptome.pe."), + multimappers = config['multimappers'] + 10, + + shadow: "full" + singularity: "docker://zavolab/star:2.6.0a" + threads: 12 + log: stdout = os.path.join( config["local_log"], "TR", - "preprocessing", - "{sample}_flag_duplicates.stdout.log"), + "{experiment}", + "{name}_map_to_transcriptome.pe.stdout.log"), stderr = os.path.join( config["local_log"], "TR", - "preprocessing", - "{sample}_flag_duplicates.stderr.log"), + "{experiment}", + "{name}_map_to_transcriptome.pe.stderr.log"), shell: "(STAR \ - --inputBAMfile {input.bam} \ - --bamRemoveDuplicatesType UniqueIdenticalNotMulti \ - --runMode inputAlignmentsFromBAM \ + --runMode alignReads \ + --runThreadN {threads} \ + --genomeDir {params.transcriptome} \ + --readFilesIn {input.mate1} {input.mate2} \ + --readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ --outFileNamePrefix {params.outFileNamePrefix} \ + --outFilterScoreMinOverLread 0.9 \ + --outFilterMatchNminOverLread 0.9 \ + --outFilterMismatchNoverLmax 0.05 \ + --outSAMattributes All \ + --outStd BAM_Unsorted \ + --outSAMtype BAM Unsorted \ + --outFilterType BySJout \ + --outReadsUnmapped None \ + --outSAMattrRGline ID:rcrunch SM:{params.experiment_id} \ + --alignIntronMax 1 \ + --alignEndsType EndToEnd > {output.bam}; \ ) 1> {log.stdout} 2> {log.stderr}" -rule TR_remove_duplicates: + +rule TR_map_to_genome_pe: """ - Remove reads flagged as duplicates by STAR. - Custom script. + Initial mapping to the genome using STAR. """ input: - bam = os.path.join( + genome = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt"), + mate1 = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate1.pe.fastq.gz"), + mate2 = os.path.join( config["output_dir"], "TR", - "flag_duplicates", - "{sample}.duplicates.Processed.out.bam"), + "{experiment}", + "{experiment}_{name}.mate2.pe.fastq.gz"), output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "remove_duplicates", - "{sample}.duplicates.bam") - ), - + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map", + "{name}", + "genome.pe.bam"), + logfile = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map", + "{name}", + "genome.pe.Log.final.out"), params: cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_filter_duplicates.py"), - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "preprocessing", - "{sample}_remove_duplicates.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "preprocessing", - "{sample}_remove_duplicates.stderr.log"), - - shell: - "(python {params.script} \ - --bamfile {input.bam} \ - --outfile {output.bam} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_no_duplicate_removal: - """ - No removal of duplicates - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "remove_ncRNAs", - "{sample}.filtered.bam"), - bai = os.path.join( - config["output_dir"], - "TR", - "remove_ncRNAs", - "{sample}.filtered.bam.bai"), - - output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "remove_duplicates", - "{sample}.with_duplicates.bam") - ), - bai = temp( - os.path.join( - config["output_dir"], - "TR", - "remove_duplicates", - "{sample}.with_duplicates.bam.bai") - ), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://bash:5.0.16" - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "preprocessing", - "{sample}_no_duplicate_removal.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "preprocessing", - "{sample}_no_duplicate_removal.stderr.log"), - - shell: - "(cp {input.bam} {output.bam}; \ - cp {input.bai} {output.bai}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_umi_collapse: - """ - Alternative duplicate removal when UMIs are present using umitools. - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "remove_ncRNAs", - "{sample}.filtered.bam"), - bai = os.path.join( - config["output_dir"], - "TR", - "remove_ncRNAs", - "{sample}.filtered.bam.bai"), - - output: - bam = temp(os.path.join( - config["output_dir"], - "TR", - "remove_duplicates", - "{sample}.umis.bam") - ), - - params: - cluster_log_path = config["cluster_log"] - - singularity: - "docker://zavolab/umi-tools:0.5.4" - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "preprocessing", - "{sample}_umi_collapse.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "preprocessing", - "{sample}_umi_collapse.stderr.log"), - - shell: - "(umi_tools \ - dedup \ - -I {input.bam} \ - --paired \ - -S {output.bam} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_merge: - input: - bam = lambda wildcards: - expand( - os.path.join( - config["output_dir"], - "TR", - "remove_duplicates", - "{sample}.{dup_type}.bam"), - sample=config[wildcards.experiment][wildcards.name], - dup_type=config[config[wildcards.experiment][wildcards.name][0]]['dup_type'], - ), - - output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.bam") - ), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_merge.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_merge.stderr.log"), - - shell: - "(samtools merge \ - {output.bam} {input.bam}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_sort_merged: - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.bam" - ), - - output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam") - ), - bai = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam.bai") - ), - - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{name}_merge_samples"), - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_sort_merged.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_sort_merged.stderr.log"), - - shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_sortname_merged: - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.bam"), - - output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.sortedname.bam") - ), - - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{name}_merge_samples_sortname"), - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_sortname_merged.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_sortname_merged.stderr.log"), - - shell: - "(samtools sort -n \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_read_frequencies: - """ - Calculate occurence of reads that are multimappers. - Custom script. - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam"), - bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam.bai"), - - output: - frequencies = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.frequencies.csv") - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_bam_get_read_frequencies.py"), - paired = lambda wildcards: - config[config[wildcards.experiment][wildcards.name][0]]['paired'], - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_read_frequencies.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_read_frequencies.stderr.log"), - - shell: - "(python {params.script} \ - --bamfile {input.bam} \ - --paired {params.paired} \ - --outfile {output.frequencies}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_bam_to_fastq: - """ - Convert initially mapped reads back to fastq files to do mappings at the - transcriptome and genome level, using bedtools. - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.sortedname.bam"), - bam_coord = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_merge_samples.sorted.bam"), - - output: - fastq_mate1 = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate1.unfiltered.fastq") - ), - fastq_mate2 = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate2.unfiltered.fastq") - ), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/bedtools:2.27.0" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_bam_to_fastq.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_bam_to_fastq.stderr.log"), - - shell: - "(bedtools bamtofastq \ - -i {input.bam} \ - -fq {output.fastq_mate1} \ - -fq2 {output.fastq_mate2}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_bam_to_fastq_filtered: - """ - Remove duplicate reads after converting bam to fastq - (when there are multimappers the reads will appear - multiple times in the fastq). - """ - input: - mate1 = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate1.unfiltered.fastq"), - mate2 = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate2.unfiltered.fastq"), - - output: - mate1 = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate1.fastq.gz") - ), - mate2 = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate2.fastq.gz") - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_fastq_keep_unique_reads.py"), - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_bam_to_fastq_filtered.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_bam_to_fastq_filtered.stderr.log"), - - shell: - "(python {params.script} \ - --fastq_in {input.mate1} \ - --fastq_out {output.mate1} \ - --fastq_in2 {input.mate2} \ - --fastq_out2 {output.mate2}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_salmon_index: - """ - Build salmon index based on transcript annotation. - Required for transcript quantification done by salmon. - """ - input: - transcriptome = config["transcriptome"] - - output: - index = directory(os.path.join( - config["output_dir"], - "TR_salmon_index") - ), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/salmon:0.11.0" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "salmon_index.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "salmon_index.stderr.log"), - - shell: - "(salmon index \ - -t {input.transcriptome} \ - -i {output.index} \ - --type fmd; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_tr_quantification: - """ - Quantification of transcript isoforms using Salmon. - """ - input: - transcriptome = os.path.join( - config["output_dir"], - "TR_salmon_index"), - mate1 = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate1.fastq.gz"), - mate2 = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate2.fastq.gz"), - - output: - quantification = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_quantification_{name}", - "quant.sf") - ), - outdir = temp( - directory( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_quantification_{name}") - ) - ), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/salmon:0.11.0" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_tr_quantification.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_tr_quantification.stderr.log"), - - shell: - "(salmon quant \ - -i {input.transcriptome} \ - -l ISR \ - -1 {input.mate1} \ - -2 {input.mate2} \ - -o {output.outdir} \ - -p {threads} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_customised_transcriptome: - """ - Build customised transcriptome keeping - the most expressed isoform per gene. - Custom script. - """ - input: - transcriptome = config["transcriptome"], - ensembl_csv = os.path.join( - config["output_dir"], - "ensembl_csv", - "ensembl.csv"), - ensembl_gtf = config["gtf"], - quantification = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_quantification_{name}", - "quant.sf"), - outdir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_quantification_{name}"), - - output: - output_gtf = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.transcriptome.gtf"), - output_fasta = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.transcriptome.fa"), - info = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.info.csv"), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_most_expressed_transcriptome.py"), - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_customised_transcriptome.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_customised_transcriptome.stderr.log"), - - shell: - "(python {params.script} \ - --quantification_file {input.quantification} \ - --transcriptome {input.transcriptome} \ - --ensembl_csv {input.ensembl_csv} \ - --ensembl_gtf {input.ensembl_gtf} \ - --out_gtf {output.output_gtf} \ - --transcript_info {output.info} \ - --out_fasta {output.output_fasta} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_transcriptome_index: - """ - Create index of the customised transcriptome using STAR. - """ - input: - tr_fasta = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.transcriptome.fa"), - tr_gtf = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.transcriptome.gtf"), - - output: - output = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{name}_STAR_index", - "chrNameLength.txt"), - - params: - cluster_log_path = config["cluster_log"], - output_dir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{name}_STAR_index"), - outFileNamePrefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{name}_STAR_index/STAR_"), - sjdbOverhang = config["sjdbOverhang"] - - singularity: - "docker://zavolab/star:2.6.0a" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_transcriptome_index.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_transcriptome_index.stderr.log"), - - shell: - "(chmod -R 777 {params.output_dir}; \ - STAR \ - --runMode genomeGenerate \ - --genomeSAindexNbases 12 \ - --genomeChrBinNbits 12 \ - --genomeDir {params.output_dir} \ - --genomeFastaFiles {input.tr_fasta} \ - --runThreadN {threads} \ - --outFileNamePrefix {params.outFileNamePrefix} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_map_to_transcriptome: - """ - Map to the transcriptome treating each transcript - as a "chromosome", using STAR. - """ - input: - transcriptome = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "replicates_STAR_index", - "chrNameLength.txt"), - mate1 = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate1.fastq.gz"), - mate2 = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate2.fastq.gz"), - - output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map", - "{name}", - "transcriptome.bam") - ), - logfile = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map", - "{name}", - "transcriptomeLog.final.out") - ), - outdir = temp( - directory( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map", - "{name}") - ) - ), - params: - cluster_log_path = config["cluster_log"], - transcriptome = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "replicates_STAR_index"), - experiment_id = "{experiment}_{name}_transcriptome", - fragment_size = config["fragment_size"], - outFileNamePrefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map", - "{name}", - "transcriptome"), - multimappers = config['multimappers'] + 10, - - singularity: - "docker://zavolab/star:2.6.0a" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_map_to_transcriptome.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_map_to_transcriptome.stderr.log"), - - shell: - "(STAR \ - --runMode alignReads \ - --runThreadN {threads} \ - --genomeDir {params.transcriptome} \ - --readFilesIn {input.mate1} {input.mate2} \ - --readFilesCommand zcat \ - --outSAMunmapped None \ - --outFilterMultimapNmax {params.multimappers} \ - --outFilterMultimapScoreRange 1 \ - --outFileNamePrefix {params.outFileNamePrefix} \ - --outFilterScoreMinOverLread 0.9 \ - --outFilterMatchNminOverLread 0.9 \ - --outFilterMismatchNoverLmax 0.05 \ - --outSAMattributes All \ - --outStd BAM_Unsorted \ - --outSAMtype BAM Unsorted \ - --outFilterType BySJout \ - --outReadsUnmapped None \ - --outSAMattrRGline ID:rcrunch SM:{params.experiment_id} \ - --alignIntronMax 1 \ - --alignEndsType EndToEnd > {output.bam}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_index_map_to_transcriptome: - """ - Index the transcriptomic alignment. - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map", - "{name}", - "transcriptome.bam" - ), - outdir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map", - "{name}") - - output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map_sort", - "{name}", - "transcriptome.sorted.bam") - ), - bai = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map_sort", - "{name}", - "transcriptome.sorted.bam.bai") - ), - - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map", - "{name}", - "transcriptome_temp"), - - singularity: - "docker://zavolab/samtools:1.8" - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_index_map_to_transcriptome.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_index_map_to_transcriptome.stderr.log"), - - shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_map_to_genome: - """ - Initial mapping to the genome using STAR. - """ - input: - genome = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt"), - mate1 = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate1.fastq.gz"), - mate2 = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}.mate2.fastq.gz"), - - output: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map", - "{name}", - "genome.bam"), - logfile = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map", - "{name}", - "genomeLog.final.out"), - outdir = temp( - directory( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map", - "{name}") - ) - ), - - params: - cluster_log_path = config["cluster_log"], - experiment_id = "{experiment}_{name}_genome", - genome = os.path.join( - config["output_dir"], - "STAR_index"), - outFileNamePrefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map", - "{name}", - "genome"), - multimappers = config["multimappers"] - - singularity: - "docker://zavolab/star:2.6.0a" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_map_to_genome.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_map_to_genome.stderr.log"), - - shell: - "(STAR \ - --runMode alignReads \ - --runThreadN {threads} \ - --genomeDir {params.genome} \ - --readFilesIn {input.mate1} {input.mate2} \ - --readFilesCommand zcat \ - --outSAMunmapped None \ - --outFilterMultimapNmax {params.multimappers} \ - --outFilterMultimapScoreRange 1 \ - --outFilterScoreMinOverLread 0.9 \ - --outFilterMatchNminOverLread 0.9 \ - --outFilterMismatchNoverLmax 0.05 \ - --outFileNamePrefix {params.outFileNamePrefix} \ - --outSAMattributes All \ - --outStd BAM_Unsorted \ - --outSAMtype BAM Unsorted \ - --outFilterType BySJout \ - --outReadsUnmapped None \ - --outSAMattrRGline ID:rcrunch SM:{params.experiment_id} \ - --alignEndsType EndToEnd > {output.bam}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_index_map_to_genome: - """ - Index the genomic alignment. - """ - input: - outdir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map", - "{name}"), - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map", - "{name}", - "genome.bam"), - - output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map_sort", - "{name}", - "genome.sorted.bam") - ), - bai = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map_sort", - "{name}", - "genome.sorted.bam.bai") - ), - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map_sort", - "{name}", - "genome_temp"), - - singularity: - "docker://zavolab/samtools:1.8" - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_index_map_to_genome.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_index_map_to_genome.stderr.log"), - - shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_assign_to_gn_tr: - """ - Remove reads that map to '-' strand in transcriptome - """ - input: - ensembl_csv = os.path.join( - config["output_dir"], - "ensembl_csv", - "ensembl.csv"), - chromosome_info = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt"), - gn_bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map_sort", - "{name}", - "genome.sorted.bam"), - gn_bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "gn_map_sort", - "{name}", - "genome.sorted.bam.bai"), - tr_bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map_sort", - "{name}", - "transcriptome.sorted.bam"), - tr_bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "tr_map_sort", - "{name}", - "transcriptome.sorted.bam.bai"), - - output: - tr_bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_transcriptome.bam") - ), - gn_bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_genome.bam") - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_assign_read_to_gn_tr.py"), - out_folder = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{name}_assign_to_gn_tr_temp"), - paired = lambda wildcards: - config[config[wildcards.experiment][wildcards.name][0]]['paired'], - sense = lambda wildcards: - config[config[wildcards.experiment][wildcards.name][0]]['sense'], - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_assign_to_gn_tr.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_assign_to_gn_tr.stderr.log"), - - shell: - "(mkdir -p {params.out_folder} ;\ - python {params.script} \ - --annotation {input.ensembl_csv} \ - --tr_bam {input.tr_bam} \ - --gn_bam {input.gn_bam} \ - --out_folder {params.out_folder} \ - --filtered_tr_bam {output.tr_bam} \ - --filtered_gn_bam {output.gn_bam} \ - --paired {params.paired} \ - --sense {params.sense} \ - --chromosome_info {input.chromosome_info} \ - --threads {threads} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_transcriptome_filter_sort: - """ - Sort and index transcriptomic reads after - filtering wrong strand reads and - those that map better to genome - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_transcriptome.bam"), - - output: - bam = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_transcriptome.sorted.bam") - ), - bai = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_transcriptome.sorted.bam.bai") - ), - - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{name}_sort_samples"), - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_transcriptome_filter_sort.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_transcriptome_filter_sort.stderr.log"), - - shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_genome_filter_sort: - """ - Sort and index genomic reads after filtering wrong strand reads and - those that map better to genome - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_genome.bam"), - - output: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_genome.sorted.bam"), - bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_genome.sorted.bam.bai"), - - params: - cluster_log_path = config["cluster_log"], - prefix = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{name}_gn_sort_samples"), - - singularity: - "docker://zavolab/samtools:1.8" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_genome_filter_sort.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_genome_filter_sort.stderr.log"), - - shell: - "(samtools sort \ - -T {params.prefix} \ - -@ {threads} \ - {input.bam} > {output.bam}; \ - samtools index \ - {output.bam} {output.bai} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_bam_to_bed: - """ - Convert bamfile to bedfile using bedtools - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_genome.sorted.bam"), - - output: - bed = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_genome.bed") - ), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/bedtools:2.27.0" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_bam_to_bed.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_bam_to_bed.stderr.log"), - - shell: - "(bedtools bamtobed \ - -i {input.bam} > {output.bed}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_bam_to_bed_transcriptome: - """ - Convert bamfile to bedfile using bedtools - """ - input: - bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_transcriptome.sorted.bam"), - - output: - bed = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_{name}_transcriptome.bed") - ), - - params: - cluster_log_path = config["cluster_log"], - - singularity: - "docker://zavolab/bedtools:2.27.0" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_bam_to_bed_transcriptome.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "{name}_bam_to_bed_transcriptome.stderr.log"), - - shell: - "(bedtools bamtobed \ - -i {input.bam} > {output.bed}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_genome_coverage: - """ - Count number of fragments in windows - of specific size at the genome. - Custom script. - """ - input: - chromosome_info = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt"), - fg_bed = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates_genome.bed"), - bg_bed = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis_genome.bed"), - fg_frequencies = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates.frequencies.csv"), - bg_frequencies = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis.frequencies.csv"), - - output: - coverage = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}.genome_coverage.bed") - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "sliding_windows.py"), - output_dir = os.path.join( - config["output_dir"], - "TR", - "{experiment}"), - window_f = lambda wildcards: - config[wildcards.experiment]["window_f"], - window_b = lambda wildcards: - config[wildcards.experiment]["window_b"], - step_size = lambda wildcards: - config[wildcards.experiment]["step_size"], - sense_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['sense'], - sense_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['sense'], - paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], - paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], - prefix = "{experiment}.genome_coverage", - background_type = lambda wildcards: - config[wildcards.experiment]["background_type"], - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "genome_coverage.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "genome_coverage.stderr.log"), - - shell: - "(python {params.script} \ - --bed_foreground {input.fg_bed} \ - --bed_background {input.bg_bed} \ - --foreground_frequencies {input.fg_frequencies} \ - --background_frequencies {input.bg_frequencies} \ - --out {params.output_dir} \ - --prefix {params.prefix} \ - --chromosomes {input.chromosome_info} \ - --sense_f {params.sense_f} \ - --sense_b {params.sense_b} \ - --paired_f {params.paired_f} \ - --paired_b {params.paired_b} \ - --window_f {params.window_f} \ - --window_b {params.window_b} \ - --step_size {params.step_size} \ - --data_type genome \ - --background {params.background_type} \ - --cutoff 1 \ - --threads {threads}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_transcriptome_coverage: - """ - Count number of fragments in windows - of specific size at - the customised transcriptome. - Custom script. - """ - input: - chromosome_info = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "replicates_STAR_index", - "chrNameLength.txt"), - fg_bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates_transcriptome.sorted.bam"), - fg_bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates_transcriptome.sorted.bam.bai"), - bg_bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis_transcriptome.sorted.bam"), - bg_bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis_transcriptome.sorted.bam.bai"), - fg_frequencies = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates.frequencies.csv"), - bg_frequencies = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis.frequencies.csv"), - - output: - coverage = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}.transcriptome_coverage.bed") - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_bam_count_sliding_windows.py"), - output_dir = os.path.join( - config["output_dir"], - "TR", - "{experiment}"), - window_f = lambda wildcards: - config[wildcards.experiment]["window_f"], - window_b = lambda wildcards: - config[wildcards.experiment]["window_b"], - step_size = lambda wildcards: - config[wildcards.experiment]["step_size"], - sense_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['sense'], - sense_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['sense'], - paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], - paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], - prefix = "{experiment}.transcriptome_coverage", - background_type = lambda wildcards: - config[wildcards.experiment]["background_type"] - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 8 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "transcriptome_coverage.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "transcriptome_coverage.stderr.log"), - - shell: - "(python {params.script} \ - --bam_foreground {input.fg_bam} \ - --bam_background {input.bg_bam} \ - --bam_foreground_frequencies {input.fg_frequencies} \ - --bam_background_frequencies {input.bg_frequencies} \ - --out {params.output_dir} \ - --prefix {params.prefix} \ - --chromosomes {input.chromosome_info} \ - --sense_f {params.sense_f} \ - --sense_b {params.sense_b} \ - --paired_f {params.paired_f} \ - --paired_b {params.paired_b} \ - --window_f {params.window_f} \ - --window_b {params.window_b} \ - --step_size {params.step_size} \ - --data_type transcriptome \ - --background {params.background_type} \ - --cutoff 1 \ - --threads {threads}; \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_merge_windows: - """ - Merge windows from foreground/ background. - Custom script. - """ - input: - tr_coverage = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}.transcriptome_coverage.bed"), - gn_coverage = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}.genome_coverage.bed"), - - output: - merged_coverage = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}.total_coverage.bed") - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_merge_fg_bg.py"), - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "merge_windows.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "merge_windows.stderr.log"), - - shell: - "(python {params.script} \ - --transcriptome {input.tr_coverage} \ - --genome {input.gn_coverage} \ - --out {output.merged_coverage} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_enriched_regions: - """ - Obtain enriched regions for binding based on EM using - bg for noise estimation. Modified and adapted from: - Berger, Severin M., et al. "Crunch: Integrated processing and modeling of - ChIP-seq data in terms of regulatory motifs." - Genome research (2019): gr-239319 - Custom script. - """ - input: - coverage = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}.total_coverage.bed"), - - output: - enriched_regions = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_enriched_regions.csv"), - parameters = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_parameters.csv"), - all_regions = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_regions.csv"), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_find_enriched_regions.py"), - plot_dir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "enriched_regions_plots"), - fdr_cutoff = config['FDR'] - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "enriched_regions.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "enriched_regions.stderr.log"), - - shell: - "(mkdir -p {params.plot_dir}; \ - python {params.script} \ - --windows_table {input.coverage} \ - --out {output.enriched_regions} \ - --all_regions {output.all_regions} \ - --out_parameters {output.parameters} \ - --out_folder {params.plot_dir} \ - --fdr_cutoff {params.fdr_cutoff} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_fit_peaks: - """ - Fit individual binding events in the enriched regions. - Modifed and adapted from: - Berger, Severin M., et al. "Crunch: Integrated processing and - modeling of ChIP-seq data in terms of regulatory motifs." - Genome research (2019): gr-239319. - Custom script. - """ - input: - fg_gn_bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates_genome.sorted.bam"), - fg_gn_bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates_genome.sorted.bam.bai"), - bg_gn_bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis_genome.sorted.bam"), - bg_gn_bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis_genome.sorted.bam.bai"), - fg_tr_bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates_transcriptome.sorted.bam"), - fg_tr_bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates_transcriptome.sorted.bam.bai"), - bg_tr_bam = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis_transcriptome.sorted.bam"), - bg_tr_bai = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis_transcriptome.sorted.bam.bai"), - fg_frequencies = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates.frequencies.csv"), - bg_frequencies = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_smis.frequencies.csv"), - enriched_regions = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_enriched_regions.csv"), - parameters = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_parameters.csv"), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt"), - chromosomes_t = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "replicates_STAR_index", - "chrNameLength.txt"), - - output: - peaks = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_peaks.csv"), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_define_peaks.py"), - plot_dir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "fit_peaks_plots"), - fragment_size = config["fragment_size"], - window_size = lambda wildcards: - config[wildcards.experiment]['window_f'], - paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], - paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], - sense_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['sense'], - sense_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['sense'], - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 12 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "fit_peaks.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "fit_peaks.stderr.log"), - - shell: - "(mkdir -p {params.plot_dir}; \ - python {params.script} \ - --enriched_regions {input.enriched_regions} \ - --parameters {input.parameters} \ - --fragment_size {params.fragment_size} \ - --window_size {params.window_size} \ - --chromosomes_g {input.chromosomes_g} \ - --bam_foreground_g {input.fg_gn_bam} \ - --bam_background_g {input.bg_gn_bam} \ - --chromosomes_t {input.chromosomes_t} \ - --bam_foreground_t {input.fg_tr_bam} \ - --bam_background_t {input.bg_tr_bam} \ - --bam_fg_fq {input.fg_frequencies} \ - --bam_bg_fq {input.bg_frequencies} \ - --paired_f {params.paired_f} \ - --paired_b {params.paired_b} \ - --sense_f {params.sense_f} \ - --sense_b {params.sense_b} \ - --out {output.peaks} \ - --out_folder {params.plot_dir} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_peaks_split_sets_crosslink: - """ - Split the significant peaks into training and test - for subsequent motif analysis - """ - input: - genome_fasta = config["genome"], - transcriptome_fasta = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates.transcriptome.fa"), - all_regions = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_regions.csv"), - reads = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_peaks.csv"), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt"), - chromosomes_t = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "replicates_STAR_index", - "chrNameLength.txt"), - - output: - training = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "training.fasta") - ), - test = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "test.fasta") - ), - training_bg = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "training_bg.fasta") - ), - test_bg = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "test_bg.fasta") - ), - dirtemp = temp( - directory( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink", - "tmpdir") - ) - ), - - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_peaks_split_sets.py"), - output_dir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "crosslink"), - genome_tag = config["genome_tag"], - peak_size = 50 - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}_peaks_split_sets_crosslink.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}_peaks_split_sets_crosslink.stderr.log"), - - shell: - "(mkdir -p {output.dirtemp}; \ - python {params.script} \ - --peaks_file {input.reads} \ - --genome_fasta {input.genome_fasta} \ - --all_regions {input.all_regions} \ - --genome_tag {params.genome_tag} \ - --tempdir {output.dirtemp} \ - --transcriptome_fasta {input.transcriptome_fasta} \ - --chromosomes_g {input.chromosomes_g} \ - --chromosomes_t {input.chromosomes_t} \ - --crosslink_type 'crosslink' \ - --peak_size {params.peak_size} \ - --out_folder {params.output_dir} \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_peaks_split_sets: - """ - Split the significant peaks into training and test - for subsequent motif analysis - """ - input: - genome_fasta = config["genome"], - transcriptome_fasta = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates.transcriptome.fa"), - all_regions = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_regions.csv"), - reads = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_peaks.csv"), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt"), - chromosomes_t = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "replicates_STAR_index", - "chrNameLength.txt"), - - output: - training = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "training.fasta") - ), - test = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "test.fasta") - ), - training_bg = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "training_bg.fasta") - ), - test_bg = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "test_bg.fasta") - ), - dirtemp = temp( - directory( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center", - "tmpdir") - ) - ), - - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "mk_peaks_split_sets.py"), - output_dir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}", - "peak_center"), - genome_tag = config["genome_tag"], - peak_size = 50 - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}_peaks_split_sets_peak_center.stdout.log"), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "motif_analysis_crossvalidation", - "{run}_peaks_split_sets_peak_center.stderr.log"), - - shell: - "(mkdir -p {output.dirtemp}; \ - python {params.script} \ - --peaks_file {input.reads} \ - --genome_fasta {input.genome_fasta} \ - --all_regions {input.all_regions} \ - --tempdir {output.dirtemp} \ - --genome_tag {params.genome_tag} \ - --transcriptome_fasta {input.transcriptome_fasta} \ - --chromosomes_g {input.chromosomes_g} \ - --chromosomes_t {input.chromosomes_t} \ - --crosslink_type 'peak_center' \ - --out_folder {params.output_dir} \ - ) 1> {log.stdout} 2> {log.stderr}" - -rule TR_peaks_split_sets_crosslink_all_motifs: - """ - Get all peaks crosslink - """ - input: - transcriptome_fasta = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates.transcriptome.fa"), - genome_fasta = config["genome"], - all_regions = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_regions.csv" - ), - reads = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_peaks.csv" - ), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - chromosomes_t = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "replicates_STAR_index", - "chrNameLength.txt"), - - output: - test = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_final", - "crosslink", - "test.fasta") - ), - test_bg = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_final", - "crosslink", - "test_bg.fasta") - ), - dirtemp = temp( - directory( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_final", - "crosslink", - "tmpdir") - ) - ), - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "total_enrichment_peakstofasta.py" - ), - output_dir = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_final", - "crosslink", - ), - genome_tag = config["genome_tag"], - peak_size = 50, - - singularity: - "docker://zavolab/rcrunch_python:1.0.5" - - threads: 1 - - log: - stdout = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "motif_analysis_final", - "peaks_split_sets__all_motifs_crosslink.stdout.log" - ), - stderr = os.path.join( - config["local_log"], - "TR", - "{experiment}", - "motif_analysis_final", - "peaks_split_sets__all_motifs_crosslink.stderr.log" - ), - - shell: - "(mkdir -p {params.output_dir}; \ - mkdir -p {output.dirtemp}; \ - python {params.script} \ - --peaks_file {input.reads} \ - --genome_fasta {input.genome_fasta} \ - --all_regions {input.all_regions} \ - --genome_tag {params.genome_tag} \ - --tempdir {output.dirtemp} \ - --transcriptome_fasta {input.transcriptome_fasta} \ - --chromosomes_g {input.chromosomes_g} \ - --chromosomes_t {input.chromosomes_t} \ - --peak_size {params.peak_size} \ - --out_folder {params.output_dir} \ - --crosslink_type 'crosslink' \ - ) 1> {log.stdout} 2> {log.stderr}" - - -rule TR_peaks_split_sets_all_motifs: - """ - get all peaks crosslink - """ - input: - transcriptome_fasta = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_replicates.transcriptome.fa"), - genome_fasta = config["genome"], - all_regions = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_regions.csv" - ), - reads = os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "{experiment}_total_peaks.csv" - ), - chromosomes_g = os.path.join( - config["output_dir"], - "STAR_index", - "chrNameLength.txt" - ), - chromosomes_t = os.path.join( + experiment_id = "{experiment}_{name}_genome", + genome = os.path.join( config["output_dir"], - "TR", - "{experiment}", - "replicates_STAR_index", - "chrNameLength.txt"), - - - output: - test = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_final", - "peak_center", - "test.fasta") - ), - test_bg = temp( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_final", - "peak_center", - "test_bg.fasta") - ), - dirtemp = temp( - directory( - os.path.join( - config["output_dir"], - "TR", - "{experiment}", - "motif_analysis_final", - "peak_center", - "tmpdir") - ) - ), - params: - cluster_log_path = config["cluster_log"], - script = os.path.join( - workflow.basedir, - "scripts", - "total_enrichment_peakstofasta.py" - ), - output_dir = os.path.join( + "STAR_index"), + outFileNamePrefix = os.path.join( config["output_dir"], "TR", "{experiment}", - "motif_analysis_final", - "peak_center", - ), - genome_tag = config["genome_tag"], - peak_size = 50 - + "gn_map", + "{name}", + "genome.pe."), + multimappers = config["multimappers"] + shadow: "full" + singularity: - "docker://zavolab/rcrunch_python:1.0.5" + "docker://zavolab/star:2.6.0a" - threads: 1 + threads: 12 log: stdout = os.path.join( config["local_log"], "TR", "{experiment}", - "motif_analysis_final", - "peaks_split_sets__all_motifs_standard.stdout.log" - ), + "{name}_map_to_genome.pe.stdout.log"), stderr = os.path.join( config["local_log"], "TR", "{experiment}", - "motif_analysis_final", - "peaks_split_sets__all_motifs_standard.stderr.log" - ), + "{name}_map_to_genome.pe.stderr.log"), shell: - "(mkdir -p {params.output_dir}; \ - mkdir -p {output.dirtemp}; \ - python {params.script} \ - --peaks_file {input.reads} \ - --genome_fasta {input.genome_fasta} \ - --all_regions {input.all_regions} \ - --genome_tag {params.genome_tag} \ - --tempdir {output.dirtemp} \ - --transcriptome_fasta {input.transcriptome_fasta} \ - --chromosomes_g {input.chromosomes_g} \ - --chromosomes_t {input.chromosomes_t} \ - --peak_size {params.peak_size} \ - --out_folder {params.output_dir} \ - --crosslink_type 'peak_center' \ - ) 1> {log.stdout} 2> {log.stderr}" - + "(STAR \ + --runMode alignReads \ + --runThreadN {threads} \ + --genomeDir {params.genome} \ + --readFilesIn {input.mate1} {input.mate2} \ + --readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ + --outFilterScoreMinOverLread 0.9 \ + --outFilterMatchNminOverLread 0.9 \ + --outFilterMismatchNoverLmax 0.05 \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outSAMattributes All \ + --outStd BAM_Unsorted \ + --outSAMtype BAM Unsorted \ + --outFilterType BySJout \ + --outReadsUnmapped None \ + --outSAMattrRGline ID:rcrunch SM:{params.experiment_id} \ + --alignEndsType EndToEnd > {output.bam}; \ + ) 1> {log.stdout} 2> {log.stderr}" \ No newline at end of file diff --git a/workflow/rules/rcrunch_transcriptomic_se.smk b/workflow/rules/rcrunch_transcriptomic_se.smk new file mode 100644 index 0000000..24eb64c --- /dev/null +++ b/workflow/rules/rcrunch_transcriptomic_se.smk @@ -0,0 +1,508 @@ +# -------------------------------------------------------------------------------- +# RCRUNCH +# Author : Katsantoni Maria +# Company: Mihaela Zavolan group, Biozentrum, Basel +# -------------------------------------------------------------------------------- +# Transcriptomic RCRUNCH subpipeline for paired-end CLIP data +# This approach is based on the hypothesis that the RBP is binding the mature +# mRNA and thus the sequence of the mature mRNA is important for the binding. +# ________________________________________________________________________________ +import os + +rule TR_initial_map_to_genome_se: + """ + Initial mapping to the genome using STAR. + """ + input: + genome = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt" + ), + reads = lambda wildcards: + expand( + os.path.join( + config["output_dir"], + "cutadapt", + "{sample}_mate1.se.cutadapt_{format}.fastq.gz"), + sample=wildcards.sample, + format=config[wildcards.sample]['format'] + ), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}", + "{sample}.se.bam") + ), + logfile = temp( + os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}", + "{sample}.se.Log.final.out") + ), + + params: + cluster_log_path = config["cluster_log"], + sample_id = "{sample}", + genome = os.path.join( + config["output_dir"], + "STAR_index"), + outFileNamePrefix = os.path.join( + config["output_dir"], + "TR", + "initial_map", + "{sample}", + "{sample}.se."), + multimappers = config["multimappers"] + + shadow: "full" + + singularity: + "docker://zavolab/star:2.6.0a" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_initial_map_to_genome.se.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_initial_map_to_genome.se.stderr.log"), + + shell: + "(STAR \ + --runMode alignReads \ + --runThreadN {threads} \ + --genomeDir {params.genome} \ + --readFilesIn {input.reads} \ + --readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outSAMattributes All \ + --outStd BAM_Unsorted \ + --outSAMtype BAM Unsorted \ + --outFilterScoreMinOverLread 0.9 \ + --outFilterMatchNminOverLread 0.9 \ + --outFilterMismatchNoverLmax 0.05 \ + --outFilterType BySJout \ + --outReadsUnmapped None \ + --outSAMattrRGline ID:rcrunch SM:{params.sample_id} \ + --alignEndsType EndToEnd > {output.bam}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_umi_collapse_se: + """ + Alternative duplicate removal when UMIs are present using umitools. + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam"), + bai = os.path.join( + config["output_dir"], + "TR", + "remove_ncRNAs", + "{sample}.filtered.bam.bai"), + + output: + bam = temp(os.path.join( + config["output_dir"], + "TR", + "remove_duplicates", + "{sample}.umis.se.bam") + ), + + params: + cluster_log_path = config["cluster_log"] + + singularity: + "docker://zavolab/umi-tools:0.5.4" + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_umi_collapse.se.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "preprocessing", + "{sample}_umi_collapse.se.stderr.log"), + + shell: + "(umi_tools \ + dedup \ + -I {input.bam} \ + -S {output.bam} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_bam_to_fastq_se: + """ + Convert initially mapped reads back to fastq files to do mappings at the + transcriptome and genome level, using bedtools. + """ + input: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.sortedname.bam"), + bam_coord = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}_merge_samples.sorted.bam"), + + output: + fastq_mate1 = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate1.se.unfiltered.fastq") + ) + params: + cluster_log_path = config["cluster_log"], + + singularity: + "docker://zavolab/bedtools:2.27.0" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_bam_to_fastq.se.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_bam_to_fastq.se.stderr.log"), + + shell: + "(bedtools bamtofastq \ + -i {input.bam} \ + -fq {output.fastq_mate1};) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_bam_to_fastq_filtered_se: + """ + Remove duplicate reads after converting bam to fastq + (when there are multimappers the reads will appear + multiple times in the fastq). + """ + input: + mate1 = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate1.se.unfiltered.fastq"), + + output: + mate1 = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate1.se.fastq.gz") + ), + + params: + cluster_log_path = config["cluster_log"], + script = os.path.join( + workflow.basedir, + "scripts", + "mk_fastq_keep_unique_reads.py"), + + singularity: + "docker://zavolab/rcrunch_python:1.0.5" + + threads: 1 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_bam_to_fastq_filtered.se.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_bam_to_fastq_filtered.se.stderr.log"), + + shell: + "(python {params.script} \ + --fastq_in {input.mate1} \ + --fastq_out {output.mate1};) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_tr_quantification_se: + """ + Quantification of transcript isoforms using Salmon. + """ + input: + transcriptome = os.path.join( + config["output_dir"], + "TR_salmon_index"), + reads = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate1.se.fastq.gz"), + + output: + quantification = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_quantification_se_{name}", + "quant.sf") + ), + + params: + cluster_log_path = config["cluster_log"], + library = lambda wildcards: get_library_type( + config[wildcards.experiment][wildcards.name][0], + config[config[wildcards.experiment][wildcards.name][0]]['sense']), + outdir = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_quantification_se_{name}"), + + shadow: "full" + singularity: + "docker://zavolab/salmon:0.11.0" + + threads: 8 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_tr_quantification.se.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_tr_quantification.se.stderr.log"), + + shell: + "(salmon quant \ + -i {input.transcriptome} \ + -l {params.library} \ + -r {input.reads} \ + -o {params.outdir} \ + -p {threads} \ + ) 1> {log.stdout} 2> {log.stderr}" + + +rule TR_map_to_transcriptome_se: + """ + Map to the transcriptome treating each transcript + as a "chromosome", using STAR. + """ + input: + transcriptome = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index", + "chrNameLength.txt"), + mate1 = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate1.se.fastq.gz"), + + output: + bam = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map", + "{name}", + "transcriptome.se.bam") + ), + logfile = temp( + os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map", + "{name}", + "transcriptome.se.Log.final.out") + ), + params: + cluster_log_path = config["cluster_log"], + transcriptome = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "replicates_STAR_index"), + experiment_id = "{experiment}_{name}_transcriptome", + fragment_size = config["fragment_size"], + outFileNamePrefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "tr_map", + "{name}", + "transcriptome.se."), + multimappers = config['multimappers'] + 10, + + shadow: "full" + + singularity: + "docker://zavolab/star:2.6.0a" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_map_to_transcriptome.se.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_map_to_transcriptome.se.stderr.log"), + + shell: + "(STAR \ + --runMode alignReads \ + --runThreadN {threads} \ + --genomeDir {params.transcriptome} \ + --readFilesIn {input.mate1} \ + --readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outFilterScoreMinOverLread 0.9 \ + --outFilterMatchNminOverLread 0.9 \ + --outFilterMismatchNoverLmax 0.05 \ + --outSAMattributes All \ + --outStd BAM_Unsorted \ + --outSAMtype BAM Unsorted \ + --outFilterType BySJout \ + --outReadsUnmapped None \ + --outSAMattrRGline ID:rcrunch SM:{params.experiment_id} \ + --alignIntronMax 1 \ + --alignEndsType EndToEnd > {output.bam}; \ + ) 1> {log.stdout} 2> {log.stderr}" + + + +rule TR_map_to_genome_se: + """ + Initial mapping to the genome using STAR. + """ + input: + genome = os.path.join( + config["output_dir"], + "STAR_index", + "chrNameLength.txt"), + mate1 = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "{experiment}_{name}.mate1.se.fastq.gz"), + + output: + bam = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map", + "{name}", + "genome.se.bam"), + logfile = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map", + "{name}", + "genome.se.Log.final.out"), + params: + cluster_log_path = config["cluster_log"], + experiment_id = "{experiment}_{name}_genome", + genome = os.path.join( + config["output_dir"], + "STAR_index"), + outFileNamePrefix = os.path.join( + config["output_dir"], + "TR", + "{experiment}", + "gn_map", + "{name}", + "genome.se."), + multimappers = config["multimappers"] + + shadow: "full" + + singularity: + "docker://zavolab/star:2.6.0a" + + threads: 12 + + log: + stdout = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_map_to_genome.se.stdout.log"), + stderr = os.path.join( + config["local_log"], + "TR", + "{experiment}", + "{name}_map_to_genome.se.stderr.log"), + + shell: + "(STAR \ + --runMode alignReads \ + --runThreadN {threads} \ + --genomeDir {params.genome} \ + --readFilesIn {input.mate1} \ + --readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ + --outFilterScoreMinOverLread 0.9 \ + --outFilterMatchNminOverLread 0.9 \ + --outFilterMismatchNoverLmax 0.05 \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outSAMattributes All \ + --outStd BAM_Unsorted \ + --outSAMtype BAM Unsorted \ + --outFilterType BySJout \ + --outReadsUnmapped None \ + --outSAMattrRGline ID:rcrunch SM:{params.experiment_id} \ + --alignEndsType EndToEnd > {output.bam}; \ + ) 1> {log.stdout} 2> {log.stderr}" \ No newline at end of file From 0783378e17f4365c539960401396a5acb0f08d01 Mon Sep 17 00:00:00 2001 From: BIOPZ-Katsantoni Maria Date: Fri, 13 Jan 2023 11:58:45 +0100 Subject: [PATCH 3/4] feat: addition of testing for single, paired end and TR in addition to GN --- .../performance_evaluation.py | 148 +++++++++--------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/test/test_singularity_execution/performance_evaluation.py b/test/test_singularity_execution/performance_evaluation.py index fb3940b..0196c7f 100755 --- a/test/test_singularity_execution/performance_evaluation.py +++ b/test/test_singularity_execution/performance_evaluation.py @@ -12,80 +12,84 @@ def main(): __doc__ = "Evaluate the run based on expected values" outpath = os.path.dirname(os.path.realpath(__file__)) os.chdir(outpath) - a = pd.read_csv( - 'results/GN/PUM2_K562_ENCSR661ICQ_2/PUM2_K562_ENCSR661ICQ_2_total_peaks.csv', - sep='\t', header=0, index_col=None) - expected = pd.read_csv( - 'means_crosslink.tsv', - sep='\t', header=0, index_col=None) - a['center'] = a['start'] + a['crosslink'] - - for index, row in a.iterrows(): - distance = ( - row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min() - if distance <= 50: - a.loc[index, 'distance'] = distance - else: - a.loc[index, 'distance'] = np.nan - rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()])) - outlier_percentage = len(a[a.distance.isna()])/len(a) * 100 - - - # expected = pd.read_csv( - # 'means_peak_center.tsv', - # sep='\t', header=0, index_col=None) - # a['center'] = a['start'] + a['mi'] - - # for index, row in a.iterrows(): - # distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min() - # if distance <= 50: - # a.loc[index, 'distance'] = distance - # else: - # a.loc[index, 'distance'] = np.nan - # sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n' - # ) - # sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n' - # ) - - # add the new test best pwm - path = f'results/GN/PUM2_K562_ENCSR661ICQ_2/motif_analysis_final/peak_center/motif_enrichment.tsv' - a = pd.read_csv(path, sep='\t', header=0, index_col=0) - b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif'] - run = b.split('_')[-1].split('.')[0] - name = path.split('/')[-4] - path_new = path.replace( - 'motif_analysis_final', - f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}') - shutil.copy(path_new, f'wlib/{name}') - - df = pd.DataFrame() - for path1 in glob.glob('wlib/*'): - name1 = path1.split('/')[-1] - for path2 in glob.glob('wlib/*'): - name2 = path2.split('/')[-1] - sim = getsimilarity(path1, path2) - df.loc[name1, name2] = sim - mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()]) - - sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n') - sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n') - sys.stdout.write(f'motif similarity: {mean_sim}\n') - if rmsd < 1.5: - sys.stdout.write('Rmsd is low. Test passed. 1/3\n') - else: - sys.stdout.write('Rmsd seems to be too high 1/3\n') - if outlier_percentage < 5: - sys.stdout.write('Few outliers detected. Test passed. 2/3\n') - else: - sys.stdout.write('Rmsd seems to be too high 2/3\n') - if mean_sim > 0.75: - sys.stdout.write('Motif similarity is high.Test passed 3/3\n') - else: - sys.stdout.write('Similarity seems to be low 3/3\n') + for runtype in ['GN', 'TR']: + for sample in ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]: + a = pd.read_csv( + f'results/{runtype}/{sample}/{sample}_total_peaks.csv', + sep='\t', header=0, index_col=None) + expected = pd.read_csv( + 'means_crosslink.tsv', + sep='\t', header=0, index_col=None) + a['center'] = a['start'] + a['crosslink'] + + for index, row in a.iterrows(): + distance = ( + row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min() + if distance <= 50: + a.loc[index, 'distance'] = distance + else: + a.loc[index, 'distance'] = np.nan + if runtype == "GN": + rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()])) + outlier_percentage = len(a[a.distance.isna()])/len(a) * 100 + + + # expected = pd.read_csv( + # 'means_peak_center.tsv', + # sep='\t', header=0, index_col=None) + # a['center'] = a['start'] + a['mi'] + + # for index, row in a.iterrows(): + # distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min() + # if distance <= 50: + # a.loc[index, 'distance'] = distance + # else: + # a.loc[index, 'distance'] = np.nan + # sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n' + # ) + # sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n' + # ) + + # add the new test best pwm + path = f'results/{runtype}/{sample}/motif_analysis_final/peak_center/motif_enrichment.tsv' + a = pd.read_csv(path, sep='\t', header=0, index_col=0) + b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif'] + run = b.split('_')[-1].split('.')[0] + name = path.split('/')[-4] + path_new = path.replace( + 'motif_analysis_final', + f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}') + shutil.copy(path_new, f'wlib/PUM2_K562_ENCSR661ICQ_2') + + df = pd.DataFrame() + for path1 in glob.glob('wlib/*'): + name1 = path1.split('/')[-1] + for path2 in glob.glob('wlib/*'): + name2 = path2.split('/')[-1] + sim = getsimilarity(path1, path2) + df.loc[name1, name2] = sim + mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()]) + sys.stdout.write(f'{runtype} {sample} evaluation\n') + if runtype == "GN": + sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n') + sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n') + + sys.stdout.write(f'motif similarity: {mean_sim}\n') + if runtype == "GN": + if rmsd < 1.5: + sys.stdout.write('Rmsd is low. Peaks are highly overlapping. Test passed. 1/3\n') + else: + sys.stdout.write('Rmsd seems to be too high. Peaks are far apart. 1/3\n') + if outlier_percentage < 5: + sys.stdout.write('Few outliers detected. Test passed. 2/3\n') + else: + sys.stdout.write('Too many peaks that are not found in the test runs 2/3\n') + if mean_sim > 0.75: + sys.stdout.write('Motif similarity is high.Test passed 3/3\n') + else: + sys.stdout.write('Motif similarity seems to be low 3/3\n\n') return - - def getSimilarityScore(wm1, wm2): wm1 = np.array(wm1) wm2 = np.array(wm2) From 8457d7a2e2e50a7487692835ce0dcbedcceb00a4 Mon Sep 17 00:00:00 2001 From: BIOPZ-Katsantoni Maria Date: Mon, 16 Jan 2023 23:23:04 +0100 Subject: [PATCH 4/4] feat: remove paired specification from config and infer from input --- config.yaml | 2 +- test/test_singularity_execution/config.yaml | 8 ++--- .../performance_evaluation.py | 29 +++++++++++-------- workflow/rules/common.smk | 6 ++-- workflow/rules/rcrunch_genomic.smk | 12 ++++---- workflow/rules/rcrunch_transcriptomic.smk | 19 ++++++------ 6 files changed, 40 insertions(+), 36 deletions(-) diff --git a/config.yaml b/config.yaml index 0b6382a..bb7a088 100644 --- a/config.yaml +++ b/config.yaml @@ -105,7 +105,7 @@ # if file name is ENCFF041KJT.fastq.gz then add ENCFF041KJT # files are assumed to be in fastq.gz format mate2: "ENCFF462SCV", - # if file name is ENCFF041KJT.fastq.gz then add ENCFF041KJT + # if file name is ENCFF462SCV.fastq.gz then add ENCFF462SCV # files are assumed to be in fastq.gz format. paired: 2, # leave as is. used for future support of single-end data diff --git a/test/test_singularity_execution/config.yaml b/test/test_singularity_execution/config.yaml index 648b1fa..c77289c 100644 --- a/test/test_singularity_execution/config.yaml +++ b/test/test_singularity_execution/config.yaml @@ -56,10 +56,10 @@ background_type: "standard"} # Sample specific info - REQUIRED - ENCFF462SCV: { mate1: "ENCFF462SCV.chr20", paired: 1, sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" } - ENCFF495ZPY: { mate1: "ENCFF495ZPY.chr20", paired: 1, sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" } - ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" } - ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" } + ENCFF462SCV: { mate1: "ENCFF462SCV.chr20", sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" } + ENCFF495ZPY: { mate1: "ENCFF495ZPY.chr20", sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" } + ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" } + ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" } # ---------------------------------------------------------------------------------------------- # RCRUNCH specific options - REQUIRED - DEFAULT VALUES SUGGESTED method_types : ["GN", "TR"] # options: GN, TR if you want BOTH methods, you can specify ['GN', 'TR'] diff --git a/test/test_singularity_execution/performance_evaluation.py b/test/test_singularity_execution/performance_evaluation.py index 0196c7f..a2e166d 100755 --- a/test/test_singularity_execution/performance_evaluation.py +++ b/test/test_singularity_execution/performance_evaluation.py @@ -7,6 +7,7 @@ import re import io + def main(): """ Main function """ __doc__ = "Evaluate the run based on expected values" @@ -68,28 +69,32 @@ def main(): name2 = path2.split('/')[-1] sim = getsimilarity(path1, path2) df.loc[name1, name2] = sim - mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()]) - sys.stdout.write(f'{runtype} {sample} evaluation\n') + mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), + df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()]) + sys.stdout.write(f'''{runtype.replace("GN","Genomic").replace("TR","Transcriptomic")} {"-".join(sample.split("_")[-2:])} evaluation\n''') + sys.stdout.write(f'motif similarity: {mean_sim}\n') if runtype == "GN": sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n') sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n') - sys.stdout.write(f'motif similarity: {mean_sim}\n') + + if mean_sim > 0.75: + sys.stdout.write('Motif similarity is high.Test passed 1/3\n') + else: + sys.stdout.write('Motif similarity seems to be low 1/3\n') if runtype == "GN": if rmsd < 1.5: - sys.stdout.write('Rmsd is low. Peaks are highly overlapping. Test passed. 1/3\n') + sys.stdout.write('Rmsd is low. Peaks are highly overlapping. Test passed. 2/3\n') else: - sys.stdout.write('Rmsd seems to be too high. Peaks are far apart. 1/3\n') + sys.stdout.write('Rmsd seems to be too high. Peaks are far apart. 2/3\n') if outlier_percentage < 5: - sys.stdout.write('Few outliers detected. Test passed. 2/3\n') + sys.stdout.write('Few outliers detected. Test passed. 3/3\n') else: - sys.stdout.write('Too many peaks that are not found in the test runs 2/3\n') - if mean_sim > 0.75: - sys.stdout.write('Motif similarity is high.Test passed 3/3\n') - else: - sys.stdout.write('Motif similarity seems to be low 3/3\n\n') + sys.stdout.write('Too many peaks are not found in the test runs 3/3\n') + sys.stdout.write('\n') return + def getSimilarityScore(wm1, wm2): wm1 = np.array(wm1) wm2 = np.array(wm2) @@ -123,7 +128,7 @@ def get_wm(path): def getsimilarity(wm1_path, wm2_path): wm1 = get_wm(wm1_path) wm2 = get_wm(wm2_path) - similarity = ( + similarity = ( (2 * getSimilarityScore(wm1, wm2)) / ( getSimilarityScore(wm1, wm1) + getSimilarityScore(wm2, wm2))) return similarity diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index fb1b213..43be315 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -12,11 +12,11 @@ def get_mates_number(sample): """Get library type""" try: if config[sample]['mate2']: - return 2 + return '2' else: - return 1 + return '1' except KeyError: - return 2 + return '1' def get_library_type(sample, sense): try: diff --git a/workflow/rules/rcrunch_genomic.smk b/workflow/rules/rcrunch_genomic.smk index e117c45..9d38ce3 100644 --- a/workflow/rules/rcrunch_genomic.smk +++ b/workflow/rules/rcrunch_genomic.smk @@ -112,7 +112,7 @@ rule GN_flag_ncRNA_reads: ncRNAs = config['ncRNAs'], ncRNA_biotypes = expand(config['ncRNA_biotypes']), paired = lambda wildcards: - config[wildcards.sample]['paired'], + get_mates_number(wildcards.sample), sense = lambda wildcards: config[wildcards.sample]['sense'], @@ -583,7 +583,7 @@ rule GN_read_frequencies: params: cluster_log_path = config["cluster_log"], paired = lambda wildcards: - config[config[wildcards.experiment][wildcards.name][0]]['paired'], + get_mates_number(config[wildcards.experiment][wildcards.name][0]), script = os.path.join( workflow.basedir, "scripts", @@ -730,9 +730,9 @@ rule GN_genome_coverage: sense_b = lambda wildcards: config[config[wildcards.experiment]["smis"][0]]['sense'], paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["replicates"][0]), paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["smis"][0]), prefix = "{experiment}.gn_coverage", background_type = lambda wildcards: config[wildcards.experiment]["background_type"], @@ -932,9 +932,9 @@ rule GN_fit_peaks: window_size = lambda wildcards: config[wildcards.experiment]['window_f'], paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["replicates"][0]), paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["smis"][0]), sense_f = lambda wildcards: config[config[wildcards.experiment]["replicates"][0]]['sense'], sense_b = lambda wildcards: diff --git a/workflow/rules/rcrunch_transcriptomic.smk b/workflow/rules/rcrunch_transcriptomic.smk index b153535..c432351 100644 --- a/workflow/rules/rcrunch_transcriptomic.smk +++ b/workflow/rules/rcrunch_transcriptomic.smk @@ -111,7 +111,7 @@ rule TR_flag_ncRNA_reads: ncRNAs = config['ncRNAs'], ncRNA_biotypes = expand(config['ncRNA_biotypes']), paired = lambda wildcards: - config[wildcards.sample]['paired'], + get_mates_number(wildcards.sample), sense = lambda wildcards: config[wildcards.sample]['sense'], @@ -611,8 +611,7 @@ rule TR_read_frequencies: "scripts", "mk_bam_get_read_frequencies.py"), paired = lambda wildcards: - config[config[wildcards.experiment][wildcards.name][0]]['paired'], - + get_mates_number(config[wildcards.experiment][wildcards.name][0]), singularity: "docker://zavolab/rcrunch_python:1.0.5" @@ -1060,7 +1059,7 @@ rule TR_assign_to_gn_tr: "{experiment}", "{name}_assign_to_gn_tr_temp"), paired = lambda wildcards: - config[config[wildcards.experiment][wildcards.name][0]]['paired'], + get_mates_number(config[wildcards.experiment][wildcards.name][0]), sense = lambda wildcards: config[config[wildcards.experiment][wildcards.name][0]]['sense'], @@ -1374,9 +1373,9 @@ rule TR_genome_coverage: sense_b = lambda wildcards: config[config[wildcards.experiment]["smis"][0]]['sense'], paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["replicates"][0]), paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["smis"][0]), prefix = "{experiment}.genome_coverage", background_type = lambda wildcards: config[wildcards.experiment]["background_type"], @@ -1496,9 +1495,9 @@ rule TR_transcriptome_coverage: sense_b = lambda wildcards: config[config[wildcards.experiment]["smis"][0]]['sense'], paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["replicates"][0]), paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["smis"][0]), prefix = "{experiment}.transcriptome_coverage", background_type = lambda wildcards: config[wildcards.experiment]["background_type"] @@ -1779,9 +1778,9 @@ rule TR_fit_peaks: window_size = lambda wildcards: config[wildcards.experiment]['window_f'], paired_f = lambda wildcards: - config[config[wildcards.experiment]["replicates"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["replicates"][0]), paired_b = lambda wildcards: - config[config[wildcards.experiment]["smis"][0]]['paired'], + get_mates_number(config[wildcards.experiment]["smis"][0]), sense_f = lambda wildcards: config[config[wildcards.experiment]["replicates"][0]]['sense'], sense_b = lambda wildcards: