-
Notifications
You must be signed in to change notification settings - Fork 0
/
Text_Mining.R
59 lines (34 loc) · 1.21 KB
/
Text_Mining.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Load a directory of documents into a tm corpus and take a first look at it.

# Install tm only when it is missing, so re-running the script does not
# reinstall the package (and hit the network) on every execution.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)

# Build the corpus from the files in the "data" directory; the path is
# resolved relative to the current working directory.
docs <- Corpus(DirSource("data"))
inspect(docs)

# Print the text of document 30. `[[` extracts the document itself, whereas
# `[` returns a one-document corpus, and as.character() on a corpus is not
# guaranteed to yield the plain text.
# NOTE(review): assumes the corpus holds at least 30 documents.
writeLines(as.character(docs[[30]]))
# toSpace: a tm transformation that replaces every match of `pattern` in the
# document text with a single space, so that the pieces on either side of the
# match stay separate words instead of being glued together.
#   x       - document text supplied by tm_map()
#   pattern - regular expression passed to gsub()
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Hyphens and colons become spaces. A separate " -" pass is unnecessary: once
# every "-" has been replaced there is nothing left for it to match.
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, ":")

# Single quotes become spaces. The straight apostrophe is handled together
# with the typographic quotes U+2018 / U+2019 in one pass.
# NOTE(review): the script previously ran the straight-apostrophe pass twice;
# presumably the two lines were meant to cover the curly quotes - confirm.
docs <- tm_map(docs, toSpace, "['\u2018\u2019]")

# Re-print document 30 to check the effect; `[[` extracts the document itself.
writeLines(as.character(docs[[30]]))
# Standard clean-up passes, applied to every document in the corpus.

# Strip punctuation, then fold everything to lower case. tolower() is a plain
# base function, so it is wrapped in content_transformer() for tm_map().
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, content_transformer(tolower))

# Drop digits.
docs <- tm_map(docs, removeNumbers)

# Remove English stop words; this runs after lower-casing so capitalised
# occurrences of the stop words are removed as well.
docs <- tm_map(docs, removeWords, stopwords("english"))

# Collapse the runs of whitespace left behind by the removals above.
docs <- tm_map(docs, stripWhitespace)

# Re-print document 30 to check the result; `[[` extracts the document itself
# (`[` would return a one-document corpus instead).
writeLines(as.character(docs[[30]]))
# Stemming: reduce each word in the corpus to its stem.

# Install SnowballC only when it is missing, so re-running the script does not
# reinstall the package on every execution.
if (!requireNamespace("SnowballC", quietly = TRUE)) {
  install.packages("SnowballC")
}
library(SnowballC)

docs <- tm_map(docs, stemDocument)

# Re-print document 30 to check the stemmed text; `[[` extracts the document
# itself (`[` would return a one-document corpus instead).
writeLines(as.character(docs[[30]]))
# Document-term matrix of the corpus: one row per document, one column per term.
dtm <- DocumentTermMatrix(docs)

# Total count of each term across all documents (column sums of the matrix
# after converting it to an ordinary dense matrix).
freq <- colSums(as.matrix(dtm))

# Number of distinct terms.
length(freq)

# Term positions ordered from most to least frequent, followed by the six
# most frequent and the six least frequent terms.
ord <- order(freq, decreasing = TRUE)
head(freq[ord])
tail(freq[ord])

# Reduced matrix: keep only terms that are 4 to 20 characters long and that
# occur in at least 3 and at most 27 documents.
# NOTE(review): dtmr is not used by the visible part of the script.
dtmr <- DocumentTermMatrix(
  docs,
  control = list(
    wordLengths = c(4, 20),
    bounds = list(global = c(3, 27))
  )
)
# Word clouds of the term frequencies.

# Install wordcloud only when it is missing, so re-running the script does not
# reinstall the package on every execution.
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library(wordcloud)

# Fix the random seed so the word layout is reproducible between runs.
set.seed(42)

# Plain cloud of every term that occurs at least 70 times in total.
wordcloud(names(freq), freq, min.freq = 70)

# Same cloud, coloured with six colours from the "Dark2" palette.
# NOTE(review): brewer.pal() is presumably made available by
# library(wordcloud) attaching RColorBrewer - confirm.
wordcloud(names(freq), freq, min.freq = 70, colors = brewer.pal(6, "Dark2"))