# wordcloud.R
# Forked from kingaa/sbied.
library(tidyverse)
library(wordcloud2)
library(tm)
library(SnowballC)
library(stringr)
library(pdftools)
# Collect every PDF under the lesson directories, extract its text, and wrap
# the result in a tm corpus (`docs`).
#
# full.names = TRUE is required here: without it list.files() returns names
# relative to each search directory, and pdf_text() cannot open them from the
# working directory. The pattern is anchored so that only names ending in
# ".pdf" are picked up.
files <- list.files(
  path = c(
    "intro", "stochsim", "pfilter", "mif", "measles",
    "od", "contacts", "polio", "ebola", "papers"
  ),
  pattern = r"{\.pdf$}",
  recursive = TRUE,
  full.names = TRUE
)

# Fail early with a clear message instead of building an empty corpus.
if (length(files) == 0L) {
  stop("no PDF files found under the lesson directories", call. = FALSE)
}

# Flatten the per-file extraction results into a single character vector.
text <- unlist(lapply(files, pdf_text))

# Replace every punctuation character with a space so that tokens separated
# only by punctuation are not glued together.
text <- str_replace_all(text, "[[:punct:]]", " ")

docs <- Corpus(VectorSource(text))
# Corpus clean-up. To look at the corpus before cleaning, run: inspect(docs)

# Transformer that overwrites every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Blank out the separator characters "/", "@" and "|", one after the other.
docs <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpace, pattern),
  c("/", "@", "\\|"),
  docs
)

# Note: no explicit lower-casing step is applied at this stage; the
# content_transformer(tolower) transform is left disabled.
#
# Remaining steps, in order: drop numbers, drop common English stop words,
# drop punctuation, and collapse runs of white space. Project-specific words
# are removed later, from the frequency table.
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
# Build the term-frequency table `d` (columns `word` and `freq`) from the
# cleaned corpus, most frequent terms first.
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# head(d, 10)

# Fold the count of each plural form into its singular form. sum() over the
# plural mask yields 0 when the plural is absent, so a missing term no longer
# aborts the script with a zero-length replacement error.
for (singular in c("parameter", "rate", "model")) {
  is_singular <- d$word == singular
  is_plural <- d$word == paste0(singular, "s")
  d$freq[is_singular] <- d$freq[is_singular] + sum(d$freq[is_plural])
}

# Add "monte carlo" as a single entry whose count is the sum of the counts of
# the two separate words. The row is built as a data frame (not a character
# matrix) so `freq` stays numeric, and it is only added when at least one of
# the two words actually occurs.
monte_carlo_freq <- sum(d$freq[d$word %in% c("monte", "carlo")])
if (monte_carlo_freq > 0) {
  newrow <- data.frame(
    word = "monte carlo",
    freq = monte_carlo_freq,
    row.names = "monte carlo"
  )
  d <- rbind(d, newrow)
}

# Re-sort, because the merged counts and the appended row change the ranking.
d <- arrange(d, desc(freq))
## Words excluded by hand from the cloud: leftover stop words, the plural and
## single-word forms that were merged into other entries, symbol fragments,
## and other terms not wanted in the picture.
excluded_words <- c(
  "parameters", "the", "can", "using", "cases", "function", "use", "this",
  "results", "one", "−",
  "rates", "values", "error", "set", "value", "rho", "beta", "eta", "mu",
  "run", "units", "may",
  "models", "mean", "ionides", "∗", "csv", "also", "lesson", "king", "small",
  "var", "much", "version",
  "will", "xn−", "used", "called", "fxn", "via", "what", "following",
  "monte", "carlo",
  "how", "first", "n−", "fyn", "two", "measir", "for", "library", "non",
  "see", "doi"
)
d <- d[!(d$word %in% excluded_words), ]

## Draw the cloud: every word horizontal (rotation fixed at 0), light random
## colours on a grey background.
wordcloud2(
  d,
  minSize = 5,
  minRotation = 0,
  maxRotation = 0,
  rotateRatio = 1,
  color = "random-light",
  backgroundColor = "grey"
)