zavolanlab · mkatsanto · Jan 16, 2023 · Jan 10, 2023 · Jan 13, 2023 · Jan 13, 2023
diff --git a/config.yaml b/config.yaml
@@ -105,7 +105,7 @@
                 # if file name is ENCFF041KJT.fastq.gz then add ENCFF041KJT 
                 # files are assumed to be in fastq.gz format
                 mate2: "ENCFF462SCV", 
-                # if file name is ENCFF041KJT.fastq.gz then add ENCFF041KJT 
+                # if file name is ENCFF462SCV.fastq.gz then add ENCFF462SCV 
                 # files are assumed to be in fastq.gz format. 
                 paired: 2, 
                 # leave as is. used for future support of single-end data

diff --git a/test/test_singularity_execution/config.yaml b/test/test_singularity_execution/config.yaml
@@ -29,7 +29,7 @@
 
   # ------------------------------------------
   # Experiment specific info - REQUIRED
-  EXPERIMENT_SET : ["PUM2_K562_ENCSR661ICQ_2"]
+  EXPERIMENT_SET : ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]
 
   # Experiment specific info - REQUIRED
   # ***Note: an experiment refers to the foreground (replicates) and background (smis) samples
@@ -38,22 +38,31 @@
             # as one sample (This functionality is useful for treating technical replicates as one sample)
             # Make sure that the samples you merge have the same sample features (e.g paired, sense, dup_type)
             # Choose an informative experiment name
-  PUM2_K562_ENCSR661ICQ_2: { 
+  PUM2_K562_ENCSR661ICQ_2_paired_end: { 
                               rbp: "PUM2",
                               replicates: ["ENCFF041KJT_ENCFF462SCV"],
                               smis: ["ENCFF616FCF_ENCFF495ZPY"],
                               window_f: 300,
                               window_b: 300,
                               step_size: 150,
                               background_type: "standard"}
+  PUM2_K562_ENCSR661ICQ_2_single_end: { 
+                              rbp: "PUM2",
+                              replicates: ["ENCFF462SCV"],
+                              smis: ["ENCFF495ZPY"],
+                              window_f: 300,
+                              window_b: 300,
+                              step_size: 150,
+                              background_type: "standard"}
 
   # Sample specific info - REQUIRED
-  ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
-  ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
-
+  ENCFF462SCV: { mate1: "ENCFF462SCV.chr20", sense: 1, format: "encode", dup_type: "umis",  mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" }
+  ENCFF495ZPY: { mate1: "ENCFF495ZPY.chr20", sense: 1, format: "encode", dup_type: "umis",  mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" }
+  ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
+  ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
   # ----------------------------------------------------------------------------------------------
   # RCRUNCH specific options - REQUIRED - DEFAULT VALUES SUGGESTED 
-  method_types : ["GN"]         # options: GN, TR  if you want BOTH methods, you can specify ['GN', 'TR']
+  method_types : ["GN", "TR"]         # options: GN, TR  if you want BOTH methods, you can specify ['GN', 'TR']
                                 # This will create both transcriptomic and genomic predictions
   multimappers: 1               # change if you want to accept reads that map to multiple regions 
                                 # (the number indicates the number of regions a read can map to at
@@ -65,7 +74,6 @@
 
 
   ## RCRUNCH specific options - DEFAULTS - Use with care
-  seq_type: "pe"                # paired end - currently only paired-end seq are allowed
   motif_lengths: [6, 8]         # Motif sizes to be considered.
   peak_center : ["crosslink", "peak_center"]   # whether to use  'crosslink' (position within where most reads start)
                                 # or 'peak_center' as the center of the peak

diff --git a/test/test_singularity_execution/performance_evaluation.py b/test/test_singularity_execution/performance_evaluation.py
@@ -7,85 +7,94 @@
 import re
 import io
 
+
 def main():
     """ Main function """
     __doc__ = "Evaluate the run based on expected values"
     outpath = os.path.dirname(os.path.realpath(__file__))
     os.chdir(outpath)
-    a = pd.read_csv(
-        'results/GN/PUM2_K562_ENCSR661ICQ_2/PUM2_K562_ENCSR661ICQ_2_total_peaks.csv',
-        sep='\t', header=0, index_col=None)
-    expected = pd.read_csv(
-        'means_crosslink.tsv',
-        sep='\t', header=0, index_col=None)
-    a['center'] = a['start'] + a['crosslink']
-
-    for index, row in a.iterrows():
-        distance = (
-            row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min()
-        if distance <= 50:
-            a.loc[index, 'distance'] = distance
-        else:
-            a.loc[index, 'distance'] = np.nan
-    rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))
-    outlier_percentage = len(a[a.distance.isna()])/len(a) * 100
-
-
-    # expected  = pd.read_csv(
-    #     'means_peak_center.tsv',
-    #     sep='\t', header=0, index_col=None)
-    # a['center'] = a['start'] +  a['mi']
-
-    # for index, row in a.iterrows():
-    #     distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min()
-    #     if distance <= 50:
-    #         a.loc[index, 'distance'] = distance
-    #     else:
-    #         a.loc[index, 'distance'] = np.nan
-    # sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n'
-    # )
-    # sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n'
-    # )
-
-    # add the new test best pwm
-    path = f'results/GN/PUM2_K562_ENCSR661ICQ_2/motif_analysis_final/peak_center/motif_enrichment.tsv'
-    a = pd.read_csv(path, sep='\t', header=0, index_col=0)
-    b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif']
-    run = b.split('_')[-1].split('.')[0]
-    name = path.split('/')[-4]
-    path_new = path.replace(
-        'motif_analysis_final',
-        f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}')
-    shutil.copy(path_new, f'wlib/{name}')
-
-    df = pd.DataFrame()
-    for path1 in glob.glob('wlib/*'):
-        name1 = path1.split('/')[-1]
-        for path2 in glob.glob('wlib/*'):
-            name2 = path2.split('/')[-1]
-            sim = getsimilarity(path1, path2)
-            df.loc[name1, name2] = sim
-    mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()])
-
-    sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n')
-    sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n')
-    sys.stdout.write(f'motif similarity: {mean_sim}\n')
-    if rmsd < 1.5:
-        sys.stdout.write('Rmsd is low. Test passed. 1/3\n')
-    else:
-        sys.stdout.write('Rmsd seems to be too high 1/3\n')
-    if outlier_percentage < 5:
-        sys.stdout.write('Few outliers detected. Test passed. 2/3\n')
-    else:
-        sys.stdout.write('Rmsd seems to be too high 2/3\n')
-    if mean_sim > 0.75:
-        sys.stdout.write('Motif similarity is high.Test passed 3/3\n')
-    else:
-        sys.stdout.write('Similarity seems to be low 3/3\n')
+    for runtype in ['GN', 'TR']:
+        for sample in ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]:
+            a = pd.read_csv(
+                f'results/{runtype}/{sample}/{sample}_total_peaks.csv',
+                sep='\t', header=0, index_col=None)
+            expected = pd.read_csv(
+                'means_crosslink.tsv',
+                sep='\t', header=0, index_col=None)
+            a['center'] = a['start'] + a['crosslink']
+
+            for index, row in a.iterrows():
+                distance = (
+                    row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min()
+                if distance <= 50:
+                    a.loc[index, 'distance'] = distance
+                else:
+                    a.loc[index, 'distance'] = np.nan
+            if runtype == "GN":
+                rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))
+                outlier_percentage = len(a[a.distance.isna()])/len(a) * 100
+
+
+            # expected  = pd.read_csv(
+            #     'means_peak_center.tsv',
+            #     sep='\t', header=0, index_col=None)
+            # a['center'] = a['start'] +  a['mi']
+
+            # for index, row in a.iterrows():
+            #     distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min()
+            #     if distance <= 50:
+            #         a.loc[index, 'distance'] = distance
+            #     else:
+            #         a.loc[index, 'distance'] = np.nan
+            # sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n'
+            # )
+            # sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n'
+            # )
+
+            # add the new test best pwm
+            path = f'results/{runtype}/{sample}/motif_analysis_final/peak_center/motif_enrichment.tsv'
+            a = pd.read_csv(path, sep='\t', header=0, index_col=0)
+            b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif']
+            run = b.split('_')[-1].split('.')[0]
+            name = path.split('/')[-4]
+            path_new = path.replace(
+                'motif_analysis_final',
+                f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}')
+            shutil.copy(path_new, f'wlib/PUM2_K562_ENCSR661ICQ_2')
+
+            df = pd.DataFrame()
+            for path1 in glob.glob('wlib/*'):
+                name1 = path1.split('/')[-1]
+                for path2 in glob.glob('wlib/*'):
+                    name2 = path2.split('/')[-1]
+                    sim = getsimilarity(path1, path2)
+                    df.loc[name1, name2] = sim
+            mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(),
+                                df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()])
+            sys.stdout.write(f'''{runtype.replace("GN","Genomic").replace("TR","Transcriptomic")} {"-".join(sample.split("_")[-2:])} evaluation\n''')
+            sys.stdout.write(f'motif similarity: {mean_sim}\n')
+            if runtype == "GN":
+                sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n')
+                sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n')
+
+
+            if mean_sim > 0.75:
+                sys.stdout.write('Motif similarity is high.Test passed 1/3\n')
+            else:
+                sys.stdout.write('Motif similarity seems to be low 1/3\n')
+            if runtype == "GN":
+                if rmsd < 1.5:
+                    sys.stdout.write('Rmsd is low. Peaks are highly overlapping. Test passed. 2/3\n')
+                else:
+                    sys.stdout.write('Rmsd seems to be too high. Peaks are far apart. 2/3\n')
+                if outlier_percentage < 5:
+                    sys.stdout.write('Few outliers detected. Test passed. 3/3\n')
+                else:
+                    sys.stdout.write('Too many peaks are not found in the test runs 3/3\n')
+            sys.stdout.write('\n')
     return
 
 
-
 def getSimilarityScore(wm1, wm2):
     wm1 = np.array(wm1)
     wm2 = np.array(wm2)
@@ -119,7 +128,7 @@ def get_wm(path):
 def getsimilarity(wm1_path, wm2_path):
     wm1 = get_wm(wm1_path)
     wm2 = get_wm(wm2_path)
-    similarity =  (
+    similarity = (
         (2 * getSimilarityScore(wm1, wm2)) / (
             getSimilarityScore(wm1, wm1) + getSimilarityScore(wm2, wm2)))
     return similarity

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -46,20 +46,16 @@ rule finish:
             # experiment=config[config["EXPERIMENT_SET"][0]]["replicates"])
 
 
-
-if config['seq_type'] == 'pe':
-    if ('TR' in config['method_types']) & ('GN' in config['method_types']):
-        include: 'rules/rcrunch_preprocessing_pe.smk'
-        include: 'rules/rcrunch_genomic_pe.smk'
-        include: 'rules/rcrunch_transcriptomic_pe.smk'
-        include: 'rules/rcrunch_motif_analysis.smk'
-    elif 'TR' in config['method_types']:
-        include: 'rules/rcrunch_preprocessing_pe.smk'
-        include: 'rules/rcrunch_transcriptomic_pe.smk'
-        include: 'rules/rcrunch_motif_analysis.smk'
-    elif 'GN' in config['method_types']:
-        include: 'rules/rcrunch_preprocessing_pe.smk'
-        include: 'rules/rcrunch_genomic_pe.smk'
-        include: 'rules/rcrunch_motif_analysis.smk'
+include: 'rules/rcrunch_preprocessing_pe.smk'
+include: 'rules/rcrunch_preprocessing_se.smk'
+include: 'rules/rcrunch_preprocessing.smk'
+include: 'rules/rcrunch_genomic_pe.smk'
+include: 'rules/rcrunch_genomic_se.smk'
+include: 'rules/rcrunch_genomic.smk'
+include: 'rules/rcrunch_transcriptomic.smk'
+include: 'rules/rcrunch_transcriptomic_pe.smk'
+include: 'rules/rcrunch_transcriptomic_se.smk'
+include: 'rules/rcrunch_motif_analysis.smk'
+include: 'rules/common.smk'
 
 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -0,0 +1,38 @@
+def get_mates(sample):
+    """Get number of mates"""
+    try:
+        if config[sample]['mate2']:
+            return "pe"
+        else:
+            return "se"
+    except KeyError:
+        return "se"
+
+def get_mates_number(sample):
+    """Get library type"""
+    try:
+        if config[sample]['mate2']:
+            return '2'
+        else:
+            return '1'
+    except KeyError:
+        return '1'
+
+def get_library_type(sample, sense):
+    try:
+        if config[sample]['mate2']:
+            if str(int(sense)) == '2':
+                return "ISR"
+            if str(int(sense)) == '1':
+                return "ISF"
+        else:
+            if str(int(sense)) == '2':
+                return "SR"
+            if str(int(sense)) == '1':
+                return "SF"
+    except KeyError:
+        if str(int(sense)) == '2':
+            return "SR"
+        if str(int(sense)) == '1':
+            return "SF"
+