-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMantaReference_swegen.R
109 lines (82 loc) · 3.13 KB
/
MantaReference_swegen.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
## This script was run using all the diploid vcfs available within each project @ Bianca
library(VariantAnnotation)
library(data.table)
# For SWEGEN
folders=dir(path = 'Manta_SAMPLES/',include.dirs = T)
files=paste0('Manta_SAMPLES/',folders,'/results/variants/diploidSV.vcf.gz')
allkeys=NULL
passkeys=NULL
for (i in 376:length(files)) if (!i %in% c(372,373,375)) {
file=files[i]
cat(i,':->',file,'\n')
nstruc_vcf=readVcf(file = file,genome = 'GRCh38')
g=geno(nstruc_vcf)
inf=info(nstruc_vcf)
rr=rowRanges(nstruc_vcf)
# manipulate into a data frame with relevant data
nstruc=as.data.table(rr)
nstruc$chr <- substr(x = nstruc$seqnames,start = 4,stop = 6)
#nstruc=nstruc[chr %in% c(1:22,'X','Y')]
## Key has only chr,start,end
key=nstruc[,c('chr','start','end')]
key$imprecise='(pr)'
## If imprecise, round the pos to 10
ix=inf$IMPRECISE==T
key$imprecise[ix]='(impr)'
key$start[ix]=round(key$start[ix]/10)*10
key$end[ix]=round(key$end[ix]/10)*10
key=paste(key$chr,key$start,key$end,key$imprecise)
fname=basename(file)
# add unique keys from this pat to the long vectors (to be tabled below)
passkeys=c(passkeys,unique(key[nstruc$FILTER=='PASS']))
allkeys=c(allkeys,unique(key))
}
save(passkeys,allkeys,file='~/mantakeys.Rdata')
## Processing of keysets:
# make tables
manta_reference_all=as.data.table(sort(table(allkeys),decreasing = T))
manta_reference_pass=as.data.table(sort(table(passkeys),decreasing = T))
colnames(manta_reference_all) <- colnames(manta_reference_pass) <- c('name','value')
swegen_hg38_manta_all <- manta_reference_all
setkey(swegen_hg38_manta_all,name)
swegen_hg38_manta_pass <- manta_reference_pass
setkey(swegen_hg38_manta_pass,name)
save(swegen_hg38_manta_all,swegen_hg38_manta_pass,file='~/Swegen_hg38_Manta_counts.Rdata')
fwrite(swegen_hg38_manta_all,'~/reports/swegen_sv_counts.csv')
##### Below: hg19
# For SWEGEN
files=dir(pattern = '.vcf')
library(VariantAnnotation)
library(data.table)
allkeys=NULL
passkeys=NULL
for (i in 1:length(files)) try( {
file=files[i]
cat(i,':->',file,'\n')
nstruc_vcf=readVcf(file = file,genome = 'GRCh37')
g=geno(nstruc_vcf)
inf=info(nstruc_vcf)
rr=rowRanges(nstruc_vcf)
# manipulate into a data frame with relevant data
nstruc=as.data.table(rr)
nstruc$chr <- nstruc$seqnames
#nstruc=nstruc[chr %in% c(1:22,'X','Y')]
## Key has only chr,start,end
key=nstruc[,c('chr','start','end')]
key$imprecise='(pr)'
## If imprecise, round the pos to 10
ix=inf$IMPRECISE==T
key$imprecise[ix]='(impr)'
key$start[ix]=round(key$start[ix]/10)*10
key$end[ix]=round(key$end[ix]/10)*10
key=paste(key$chr,key$start,key$end,key$imprecise)
# add unique keys from this pat to the long vectors (to be tabled below)
passkeys=c(passkeys,unique(key[nstruc$FILTER=='PASS']))
allkeys=c(allkeys,unique(key))
}, silent=T)
save(passkeys,allkeys,file='~/mantakeys_hg19.Rdata')
manta_reference_all <- as.data.table(sort(table(allkeys),decreasing = T))
colnames(manta_reference_all) <- c('name','value')
swegen_hg19_manta_all <- manta_reference_all
setkey(swegen_hg19_manta_all,name)
fwrite(swegen_hg19_manta_all,'~/swegen_sv_counts_hg19.csv')