-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter.R
136 lines (104 loc) · 3.89 KB
/
filter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# filter.R
#
# Author: Mouhamadou Fadel DIOP
# Date: 2021-10-06
#
# Purpose:
# Remove missing data on sample and locus
# Produce filtered VCF file.
#
# ------------------------------------------------------------------
# NOTE - uncomment these lines to install packages as needed
# install.packages("data.table")
# install.packages("tictoc")
arguments <- commandArgs(trailingOnly = TRUE)
Dir <- arguments[1]
# Load packages
library(data.table)
library(tictoc)
options(scipen=999)
source("Functions.R")
#=============================================
#=== Extracting genotypes from the raw data
#=============================================
setwd(Dir)
vcf <- arguments[2]
Genotypes <- '../process/Genotypes.txt'
expression <- '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n'
system(sprintf("bcftools query -f'%s' %s > %s", expression, vcf, Genotypes))
#=============================================
#-- Computing the missingness on SNPs and samples
#-- Get the sample names from the VCF file
#=============================================
sampleList <- '../process/SampleList.txt'
system(sprintf("bcftools query -l %s > %s", vcf, sampleList))
AfricanSamplesList <- fread(sampleList, header = FALSE)
#=============================================
#---- put the first four columns in a variable
#=============================================
Genotype <- fread(Genotypes, header = FALSE)
first4Column <- subset(Genotype, select=c(1:4))
Genotype <- subset(Genotype, select=-c(1:4))
#===================
## Remove invariants
#==================
varsnp <- apply(Genotype, 1, triquad)
keepvar <- which(varsnp != 1)
data2 <- cbind(first4Column, Genotype)
data2 <- data2[keepvar,]
geno <- data2[,5:ncol(data2)]
first4Column <- data2[,1:4]
nrow(geno)
ncol(geno)
#======================================
#--- computing missingness on SNPs
#======================================
tic();snpMissingness <- computeMissingnessOnSNPs(geno);toc()
geno <- cbind(first4Column, geno, snpMissingness)
geno <- geno[which(geno$snpMissingness <= 0.2), ]
first4Column <- subset(geno, select = c(1:4))
geno <- subset(geno, select = -c(1:4, ncol(geno)))
#================================
#=== Compute Missingness on samples
#================================
tic(); sampleMissingness <- computeMissingness(geno); toc()
geno <- t(geno)
rownames(geno) <- AfricanSamplesList$V1
geno <- as.data.frame(geno)
geno$Missingness <- sampleMissingness
index <- which(geno$Missingness > 0.15)
geno <- subset(geno, select = -c(ncol(geno)))
geno <- as.data.frame(t(geno))
if(!is_empty(index)) geno <- geno[, -index]
#===================
## Remove invariants
#==================
varsnp <- apply(geno, 1, triquad)
keepvar <- which(varsnp != 1)
data2 <- cbind(first4Column, geno)
data2 <- data2[keepvar,]
geno <- data2[,5:ncol(data2)]
first4Column <- data2[,1:4]
nrow(geno)
ncol(geno)
## Save positions and isolates to keep for downstream analysis
write.table(colnames(geno), '../process/samplesTokeep.txt',
col.names = FALSE, row.names = FALSE, quote = FALSE)
write.table(first4Column[,1:2], "../process/snpsTokeep.txt",
col.names = FALSE, row.names = FALSE, quote = FALSE, sep = '\t')
#=========================================================
#---- Removing SNPs and Isolates to discard from vcf file
#========================================================
samplesToKeep <- "../process/samplesTokeep.txt"
snpsToKeep <- "../process/snpsTokeep.txt"
filtered.vcf <- paste0("../process/", gsub(".vcf.gz", ".filtered", vcf))
system(paste0("vcftools --gzvcf ", vcf,
" --keep ", samplesToKeep,
" --positions ", snpsToKeep,
" --not-chr Pf3D7_API_v3",
" --recode --recode-INFO-all --out ",
filtered.vcf))
file.remove(Genotypes, samplesToKeep, snpsToKeep)
system(paste0("mv ", filtered.vcf, ".recode.vcf ", filtered.vcf, ".vcf"))
system(paste0("bgzip ", filtered.vcf, ".vcf"))
system(paste0("tabix ", filtered.vcf, ".vcf.gz"))