-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerateLogos.R
170 lines (138 loc) · 6.89 KB
/
generateLogos.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Install and load required packages
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(c("GenomicFeatures", "BSgenome.Hsapiens.UCSC.hg38", "Biostrings", "ggseqlogo","RMariaDB","txdbmaker"))
library(RMariaDB)
library(GenomicFeatures)
library(BSgenome.Hsapiens.UCSC.hg38)
library(Biostrings)
library(ggseqlogo)
library(txdbmaker)
library(ggplot2)
library(gridExtra)
library(dplyr)
# Download the UCSC RefSeq gene track
txdb <- makeTxDbFromUCSC(genome = "hg38", tablename = "ncbiRefSeq")
# Define canonical chromosomes
canonical_chr <- paste0("chr", c(1:22, "X", "Y", "M"))
# Get all transcripts with their lengths
all_transcripts <- transcripts(txdb, columns = c("tx_id", "tx_name", "gene_id"))
all_transcripts$tx_length <- width(all_transcripts)
# Function to identify protein-coding transcripts based on RefSeq accession
is_protein_coding <- function(tx_name) {
startsWith(tx_name, "NM_")
}
# Filter for protein-coding genes
protein_coding_transcripts <- all_transcripts[is_protein_coding(all_transcripts$tx_name)]
# Select the longest transcript per protein-coding gene
longest_per_gene <- protein_coding_transcripts %>%
as.data.frame() %>%
group_by(gene_id) %>%
slice_max(order_by = tx_length, n = 1) %>%
pull(tx_name)
# Extract exons for the longest transcripts
exons_by_tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
longest_tx_exons <- exons_by_tx[names(exons_by_tx) %in% longest_per_gene]
# Process internal exons more efficiently
internal_exons <- unlist(longest_tx_exons)
exon_ranks <- rle(names(internal_exons))$lengths
is_internal <- rep(FALSE, length(internal_exons))
is_internal[cumsum(exon_ranks[-length(exon_ranks)]) + 1] <- TRUE
is_internal[cumsum(exon_ranks)] <- TRUE
internal_exons <- internal_exons[!is_internal]
# Filter for canonical chromosomes
internal_exons <- internal_exons[seqnames(internal_exons) %in% canonical_chr]
# Extract splice sites
five_ss <- GRanges()
three_ss <- GRanges()
# For positive strand
pos_exons <- internal_exons[strand(internal_exons) == "+"]
five_ss <- c(five_ss, resize(pos_exons, width = 1, fix = "end"))
three_ss <- c(three_ss, resize(pos_exons, width = 1, fix = "start"))
# For negative strand
neg_exons <- internal_exons[strand(internal_exons) == "-"]
five_ss <- c(five_ss, resize(neg_exons, width = 1, fix = "start"))
three_ss <- c(three_ss, resize(neg_exons, width = 1, fix = "end"))
get_ss_seq <- function(ss, genome, exon_nt, intron_nt, is_five_prime = TRUE) {
pos_strand <- strand(ss) == "+"
if (is_five_prime) {
# Correct start and end positions for the 5' splice site
start_pos <- ifelse(pos_strand,
start(ss) - exon_nt + 1,
end(ss) - intron_nt + 1)
end_pos <- ifelse(pos_strand,
start(ss) + intron_nt,
end(ss) + exon_nt)
} else {
# Correct start and end positions for the 3' splice site
start_pos <- ifelse(pos_strand,
start(ss) - intron_nt,
end(ss) - exon_nt)
end_pos <- ifelse(pos_strand,
start(ss) + exon_nt - 1,
end(ss) + intron_nt - 1)
}
# Ensure proper genomic ranges
ss_ranges <- GRanges(seqnames(ss),
IRanges(start_pos, end_pos),
strand = strand(ss))
ss_seq <- getSeq(genome, ss_ranges)
return(ss_seq)
}
# Get splice site sequences
five_ss_seq <- get_ss_seq(five_ss, Hsapiens, 3, 6, is_five_prime = TRUE)
three_ss_seq <- get_ss_seq(three_ss, Hsapiens, 3, 8, is_five_prime = FALSE)
# Convert to character vectors
five_ss <- as.character(five_ss_seq)
three_ss <- as.character(three_ss_seq)
# Filter for canonical splice sites
five_ss_canonical <- five_ss[substr(five_ss, 4, 5) == "GT"]
three_ss_canonical <- three_ss[substr(three_ss, 7, 8) == "AG"]
# Generate sequence logos with axis lines for both x and y axes, and y-axis limited to 1
if (length(five_ss_canonical) > 0 && length(three_ss_canonical) > 0) {
pdf("canonical_splice_site_logos_protein_coding.pdf", width = 15, height = 5)
# 5'ss logo
p1 <- ggseqlogo(five_ss_canonical, method = "prob", seq_type = "dna") +
theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 18),
axis.title = element_text(size = 18),
panel.grid = element_blank(), # Remove grid lines
panel.border = element_blank(), # Remove plot border
axis.line = element_line(color = "black", size = 0.8)) + # Add black axis lines
xlab("Position relative to splice site") +
ylab("Probability") +
scale_x_continuous(breaks = 1:9, labels = c(-3:-1, 0, 1:5)) +
geom_vline(xintercept = 3.5, linetype = "dotted", color = "black", size = 1) + # Thicker black dotted line
coord_cartesian(ylim = c(0, 1)) # Limit y-axis to 1
# 3'ss logo
p2 <- ggseqlogo(three_ss_canonical, method = "prob", seq_type = "dna") +
theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 18),
axis.title = element_text(size = 18),
panel.grid = element_blank(), # Remove grid lines
panel.border = element_blank(), # Remove plot border
axis.line = element_line(color = "black", size = 0.8)) + # Add black axis lines
xlab("Position relative to splice site") +
ylab("Probability") +
scale_x_continuous(breaks = 1:11, labels = c(-7:-1, 0, 1:3)) + # Corrected labels for 3'ss
geom_vline(xintercept = 8.5, linetype = "dotted", color = "black", size = 1) + # Thicker black dotted line
coord_cartesian(ylim = c(0, 1)) # Limit y-axis to 1
# Arrange plots side by side with different widths
grid.arrange(p1, p2, ncol = 2, widths = c(2, 3))
dev.off()
print("Canonical sequence logos for protein-coding genes have been generated and saved as 'canonical_splice_site_logos_protein_coding.pdf'")
} else {
cat("\nError: Unable to generate sequence logos due to lack of canonical splice sites.\n")
}
# Print some statistics
cat("\nTotal genes:", length(unique(all_transcripts$gene_id)), "\n")
cat("Total transcripts:", length(all_transcripts), "\n")
cat("Longest transcripts selected:", length(longest_per_gene), "\n")
cat("Total internal exons:", length(internal_exons), "\n")
cat("Total 5' splice sites:", length(five_ss), "\n")
cat("Canonical 5' splice sites (GT):", length(five_ss_canonical), "\n")
cat("Percentage of canonical 5' splice sites:", round(length(five_ss_canonical) / length(five_ss) * 100, 2), "%\n\n")
cat("Total 3' splice sites:", length(three_ss), "\n")
cat("Canonical 3' splice sites (AG):", length(three_ss_canonical), "\n")
cat("Percentage of canonical 3' splice sites:", round(length(three_ss_canonical) / length(three_ss) * 100, 2), "%\n")
print("Canonical sequence logos have been generated and saved as 'canonical_splice_site_logos.pdf'")