Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support single-stranded clip data #16

Merged
merged 4 commits into from
Jan 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
# if file name is ENCFF041KJT.fastq.gz then add ENCFF041KJT
# files are assumed to be in fastq.gz format
mate2: "ENCFF462SCV",
# if file name is ENCFF041KJT.fastq.gz then add ENCFF041KJT
# if file name is ENCFF462SCV.fastq.gz then add ENCFF462SCV
# files are assumed to be in fastq.gz format.
paired: 2,
# leave as is. used for future support of single-end data
Expand Down
22 changes: 15 additions & 7 deletions test/test_singularity_execution/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

# ------------------------------------------
# Experiment specific info - REQUIRED
EXPERIMENT_SET : ["PUM2_K562_ENCSR661ICQ_2"]
EXPERIMENT_SET : ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]

# Experiment specific info - REQUIRED
# ***Note: an experiment refers to the foreground (replicates) and background (smis) samples
Expand All @@ -38,22 +38,31 @@
# as one sample (This functionality is useful for treating technical replicates as one sample)
# Make sure that the samples you merge have the same sample features (e.g paired, sense, dup_type)
# Choose an informative experiment name
PUM2_K562_ENCSR661ICQ_2: {
PUM2_K562_ENCSR661ICQ_2_paired_end: {
rbp: "PUM2",
replicates: ["ENCFF041KJT_ENCFF462SCV"],
smis: ["ENCFF616FCF_ENCFF495ZPY"],
window_f: 300,
window_b: 300,
step_size: 150,
background_type: "standard"}
PUM2_K562_ENCSR661ICQ_2_single_end: {
rbp: "PUM2",
replicates: ["ENCFF462SCV"],
smis: ["ENCFF495ZPY"],
window_f: 300,
window_b: 300,
step_size: 150,
background_type: "standard"}

# Sample specific info - REQUIRED
ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", paired: 2, sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }

ENCFF462SCV: { mate1: "ENCFF462SCV.chr20", sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" }
ENCFF495ZPY: { mate1: "ENCFF495ZPY.chr20", sense: 1, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate2_3p.fasta", mate1_5p: "../input_files_test/mate2_5p.fasta" }
ENCFF041KJT_ENCFF462SCV: { mate1: "ENCFF041KJT.chr20", mate2: "ENCFF462SCV.chr20", sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
ENCFF616FCF_ENCFF495ZPY: { mate1: "ENCFF616FCF.chr20", mate2: "ENCFF495ZPY.chr20", sense: 2, format: "encode", dup_type: "umis", mate1_3p: "../input_files_test/mate1_3p.fasta", mate1_5p: "../input_files_test/mate1_5p.fasta", mate2_3p: "../input_files_test/mate2_3p.fasta", mate2_5p: "../input_files_test/mate2_5p.fasta" }
# ----------------------------------------------------------------------------------------------
# RCRUNCH specific options - REQUIRED - DEFAULT VALUES SUGGESTED
method_types : ["GN"] # options: GN, TR if you want BOTH methods, you can specify ['GN', 'TR']
method_types : ["GN", "TR"] # options: GN, TR if you want BOTH methods, you can specify ['GN', 'TR']
# This will create both transcriptomic and genomic predictions
multimappers: 1 # change if you want to accept reads that map to multiple regions
# (the number indicates the number of regions a read can map to at
Expand All @@ -65,7 +74,6 @@


## RCRUNCH specific options - DEFAULTS - Use with care
seq_type: "pe" # paired end - currently only paired-end seq are allowed
motif_lengths: [6, 8] # Motif sizes to be considered.
peak_center : ["crosslink", "peak_center"] # whether to use 'crosslink' (position within where most reads start)
# or 'peak_center' as the center of the peak
Expand Down
153 changes: 81 additions & 72 deletions test/test_singularity_execution/performance_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,85 +7,94 @@
import re
import io


def main():
""" Main function """
__doc__ = "Evaluate the run based on expected values"
outpath = os.path.dirname(os.path.realpath(__file__))
os.chdir(outpath)
a = pd.read_csv(
'results/GN/PUM2_K562_ENCSR661ICQ_2/PUM2_K562_ENCSR661ICQ_2_total_peaks.csv',
sep='\t', header=0, index_col=None)
expected = pd.read_csv(
'means_crosslink.tsv',
sep='\t', header=0, index_col=None)
a['center'] = a['start'] + a['crosslink']

for index, row in a.iterrows():
distance = (
row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min()
if distance <= 50:
a.loc[index, 'distance'] = distance
else:
a.loc[index, 'distance'] = np.nan
rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))
outlier_percentage = len(a[a.distance.isna()])/len(a) * 100


# expected = pd.read_csv(
# 'means_peak_center.tsv',
# sep='\t', header=0, index_col=None)
# a['center'] = a['start'] + a['mi']

# for index, row in a.iterrows():
# distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min()
# if distance <= 50:
# a.loc[index, 'distance'] = distance
# else:
# a.loc[index, 'distance'] = np.nan
# sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n'
# )
# sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n'
# )

# add the new test best pwm
path = f'results/GN/PUM2_K562_ENCSR661ICQ_2/motif_analysis_final/peak_center/motif_enrichment.tsv'
a = pd.read_csv(path, sep='\t', header=0, index_col=0)
b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif']
run = b.split('_')[-1].split('.')[0]
name = path.split('/')[-4]
path_new = path.replace(
'motif_analysis_final',
f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}')
shutil.copy(path_new, f'wlib/{name}')

df = pd.DataFrame()
for path1 in glob.glob('wlib/*'):
name1 = path1.split('/')[-1]
for path2 in glob.glob('wlib/*'):
name2 = path2.split('/')[-1]
sim = getsimilarity(path1, path2)
df.loc[name1, name2] = sim
mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()])

sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n')
sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n')
sys.stdout.write(f'motif similarity: {mean_sim}\n')
if rmsd < 1.5:
sys.stdout.write('Rmsd is low. Test passed. 1/3\n')
else:
sys.stdout.write('Rmsd seems to be too high 1/3\n')
if outlier_percentage < 5:
sys.stdout.write('Few outliers detected. Test passed. 2/3\n')
else:
sys.stdout.write('Rmsd seems to be too high 2/3\n')
if mean_sim > 0.75:
sys.stdout.write('Motif similarity is high.Test passed 3/3\n')
else:
sys.stdout.write('Similarity seems to be low 3/3\n')
for runtype in ['GN', 'TR']:
for sample in ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]:
a = pd.read_csv(
f'results/{runtype}/{sample}/{sample}_total_peaks.csv',
sep='\t', header=0, index_col=None)
expected = pd.read_csv(
'means_crosslink.tsv',
sep='\t', header=0, index_col=None)
a['center'] = a['start'] + a['crosslink']

for index, row in a.iterrows():
distance = (
row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min()
if distance <= 50:
a.loc[index, 'distance'] = distance
else:
a.loc[index, 'distance'] = np.nan
if runtype == "GN":
rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))
outlier_percentage = len(a[a.distance.isna()])/len(a) * 100


# expected = pd.read_csv(
# 'means_peak_center.tsv',
# sep='\t', header=0, index_col=None)
# a['center'] = a['start'] + a['mi']

# for index, row in a.iterrows():
# distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min()
# if distance <= 50:
# a.loc[index, 'distance'] = distance
# else:
# a.loc[index, 'distance'] = np.nan
# sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n'
# )
# sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n'
# )

# add the new test best pwm
path = f'results/{runtype}/{sample}/motif_analysis_final/peak_center/motif_enrichment.tsv'
a = pd.read_csv(path, sep='\t', header=0, index_col=0)
b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif']
run = b.split('_')[-1].split('.')[0]
name = path.split('/')[-4]
path_new = path.replace(
'motif_analysis_final',
f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}')
shutil.copy(path_new, f'wlib/PUM2_K562_ENCSR661ICQ_2')

df = pd.DataFrame()
for path1 in glob.glob('wlib/*'):
name1 = path1.split('/')[-1]
for path2 in glob.glob('wlib/*'):
name2 = path2.split('/')[-1]
sim = getsimilarity(path1, path2)
df.loc[name1, name2] = sim
mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(),
df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()])
sys.stdout.write(f'''{runtype.replace("GN","Genomic").replace("TR","Transcriptomic")} {"-".join(sample.split("_")[-2:])} evaluation\n''')
sys.stdout.write(f'motif similarity: {mean_sim}\n')
if runtype == "GN":
sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n')
sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n')


if mean_sim > 0.75:
sys.stdout.write('Motif similarity is high.Test passed 1/3\n')
else:
sys.stdout.write('Motif similarity seems to be low 1/3\n')
if runtype == "GN":
if rmsd < 1.5:
sys.stdout.write('Rmsd is low. Peaks are highly overlapping. Test passed. 2/3\n')
else:
sys.stdout.write('Rmsd seems to be too high. Peaks are far apart. 2/3\n')
if outlier_percentage < 5:
sys.stdout.write('Few outliers detected. Test passed. 3/3\n')
else:
sys.stdout.write('Too many peaks are not found in the test runs 3/3\n')
sys.stdout.write('\n')
return



def getSimilarityScore(wm1, wm2):
wm1 = np.array(wm1)
wm2 = np.array(wm2)
Expand Down Expand Up @@ -119,7 +128,7 @@ def get_wm(path):
def getsimilarity(wm1_path, wm2_path):
wm1 = get_wm(wm1_path)
wm2 = get_wm(wm2_path)
similarity = (
similarity = (
(2 * getSimilarityScore(wm1, wm2)) / (
getSimilarityScore(wm1, wm1) + getSimilarityScore(wm2, wm2)))
return similarity
Expand Down
26 changes: 11 additions & 15 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,16 @@ rule finish:
# experiment=config[config["EXPERIMENT_SET"][0]]["replicates"])



if config['seq_type'] == 'pe':
if ('TR' in config['method_types']) & ('GN' in config['method_types']):
include: 'rules/rcrunch_preprocessing_pe.smk'
include: 'rules/rcrunch_genomic_pe.smk'
include: 'rules/rcrunch_transcriptomic_pe.smk'
include: 'rules/rcrunch_motif_analysis.smk'
elif 'TR' in config['method_types']:
include: 'rules/rcrunch_preprocessing_pe.smk'
include: 'rules/rcrunch_transcriptomic_pe.smk'
include: 'rules/rcrunch_motif_analysis.smk'
elif 'GN' in config['method_types']:
include: 'rules/rcrunch_preprocessing_pe.smk'
include: 'rules/rcrunch_genomic_pe.smk'
include: 'rules/rcrunch_motif_analysis.smk'
include: 'rules/rcrunch_preprocessing_pe.smk'
include: 'rules/rcrunch_preprocessing_se.smk'
include: 'rules/rcrunch_preprocessing.smk'
include: 'rules/rcrunch_genomic_pe.smk'
include: 'rules/rcrunch_genomic_se.smk'
include: 'rules/rcrunch_genomic.smk'
include: 'rules/rcrunch_transcriptomic.smk'
include: 'rules/rcrunch_transcriptomic_pe.smk'
include: 'rules/rcrunch_transcriptomic_se.smk'
include: 'rules/rcrunch_motif_analysis.smk'
include: 'rules/common.smk'


38 changes: 38 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
def get_mates(sample):
"""Get number of mates"""
try:
if config[sample]['mate2']:
return "pe"
else:
return "se"
except KeyError:
return "se"

def get_mates_number(sample):
"""Get library type"""
try:
if config[sample]['mate2']:
return '2'
else:
return '1'
except KeyError:
return '1'

def get_library_type(sample, sense):
try:
if config[sample]['mate2']:
if str(int(sense)) == '2':
return "ISR"
if str(int(sense)) == '1':
return "ISF"
else:
if str(int(sense)) == '2':
return "SR"
if str(int(sense)) == '1':
return "SF"
except KeyError:
if str(int(sense)) == '2':
return "SR"
if str(int(sense)) == '1':
return "SF"

Loading