#' 004_generate_topic_model.r
#' Topic modelling with MALLET using clean fulltext
#' based on http://www.cs.princeton.edu/~mimno/R/
require(mallet)
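# Assumption: 'blogtext' is already in the workspace, built by the earlier
# scripts in this series, as a data frame with text, author and year columns.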
documents <- data.frame(text = blogtext$text,
                        id = make.unique(blogtext$author),
                        class = blogtext$year,
                        stringsAsFactors = FALSE)
mallet.instances <- mallet.import(documents$id,
                                  documents$text,
                                  "C:/mallet-2.0.7/stoplists/en.txt",
                                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
## Create a topic trainer object.
n.topics <- 30
topic.model <- MalletLDA(n.topics)
## Load our documents. We could also pass in the filename of a
## saved instance list file that we build from the command-line tools.
topic.model$loadDocuments(mallet.instances)
## Get the vocabulary, and some statistics about word frequencies.
## These may be useful in further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
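# Sketch: peek at the terms that occur in the most documents; these are
# candidates for adding to the stoplist. The column names (term.freq,
# doc.freq) are assumed from the mallet package documentation, so check
# names(word.freqs) if your version differs.
head(word.freqs[order(word.freqs$doc.freq, decreasing = TRUE), ], 20)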
## Optimize hyperparameters every 20 iterations,
## after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)
## Now train a model. Note that hyperparameter optimization is on, by default.
## We can specify the number of iterations. Here we'll use a large-ish round number.
topic.model$train(200)
## NEW: run through a few iterations where we pick the best topic for each token,
## rather than sampling from the posterior distribution.
topic.model$maximize(10)
## Get the probability of topics in documents and the probability of words in topics.
## By default, these functions return raw word counts. Here we want probabilities,
## so we normalize, and add "smoothing" so that nothing has exactly 0 probability.
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)
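# Quick sanity check (sketch): doc.topics should have one row per document and
# one column per topic, with each row summing to 1 after normalisation.
dim(doc.topics)
summary(rowSums(doc.topics))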
# from http://www.cs.princeton.edu/~mimno/R/clustertrees.R
## transpose and normalize the doc topics
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
## Get a vector containing short names for the topics
topics.labels <- rep("", n.topics)
for (topic in 1:n.topics) {
  topics.labels[topic] <- paste(mallet.top.words(topic.model, topic.words[topic, ],
                                                 num.top.words = 5)$words,
                                collapse = " ")
}
# have a look at keywords for each topic
topics.labels
# create data.frame with columns as authors and rows as topics
topic_docs <- data.frame(topic.docs)
names(topic_docs) <- documents$id
# find top n topics for a certain author
df1 <- t(topic_docs[,grep("Sarah Bennett", names(topic_docs))])
colnames(df1) <- topics.labels
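# Sketch: a quick numeric answer to "top n topics for this author" -- average
# each topic's proportion over this author's documents and show the five
# strongest, labelled by their keywords.
sort(colMeans(df1), decreasing = TRUE)[1:5]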
require(reshape2)
topic.proportions.df <- melt(cbind(data.frame(df1),
                                   document = factor(1:nrow(df1))),
                             variable.name = "topic",
                             id.vars = "document")
# plot for each doc by that author
require(ggplot2)
ggplot(topic.proportions.df, aes(topic, value, fill = document)) +
  geom_bar(stat = "identity") +
  ylab("proportion") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() +
  facet_wrap(~ document, ncol = 5)
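# Optional (sketch): save the last plot to disk with ggplot2's ggsave();
# the filename here is only a placeholder.
ggsave("author_topic_proportions.png", width = 10, height = 6)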
## cluster based on shared words
plot(hclust(dist(topic.words)), labels=topics.labels)
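# A complementary view (sketch): cluster topics by how they are spread across
# documents, using the row-normalised topic.docs matrix computed above.
plot(hclust(dist(topic.docs)), labels = topics.labels)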
## How do topics differ across different years?
topic_docs_t <- data.frame(t(topic_docs))
topic_docs_t$year <- documents$class
# topic_docs_t is now a data frame where each row is a document and
# each column is a topic; the cells contain topic proportions, and the
# final column gives each document's year. The next line computes the
# average proportion of each topic across all the posts in a given
# year. Note that topic_docs_t$year contains one anomalous FALSE value,
# which dirties the data slightly and causes warnings.
df3 <- aggregate(topic_docs_t, by=list(topic_docs_t$year), FUN=mean)
# This next line transposes the wide data frame created above into a
# tall data frame where each column is a year. The input data frame is
# subset with the %in% operator to drop the row produced by the
# anomalous FALSE value that stands in place of the year for one blog
# post (probably a glitch in that blog page's format). The last column
# is also excluded because it contains NAs, a side effect of the
# aggregate() call above. Here's my original, two-year-only line:
# df3 <- data.frame(t(df3[-3,-length(df3)]), stringsAsFactors = FALSE)
# And below is an updated version that generalises this in case you
# have more than two years:
years <- sort(as.character(na.omit(as.numeric(as.character(unique(topic_docs_t$year))))))
df3 <- data.frame(t(df3[(df3$Group.1 %in% years),-length(df3)]), stringsAsFactors = FALSE)
# now we put on informative column names
# names(df3) <- c("y2012", "y2013")
# Here's a more general version in case you have more than two years
# or different years to what I've got:
names(df3) <- unname(sapply(years, function(i) paste0("y",i)))
# the next line removes the first row, which is just the years
df3 <- df3[-1,]
# the next line converts all the values to numbers so we can
# work on them
df3 <- data.frame(apply(df3, 2, as.numeric))
df3$topic <- 1:n.topics
# which topics differ the most between the years?
# If you have more than two years you will need to add more pairwise
# comparisons, one per pair of years (see the general sketch after the
# commented examples below). Here is one pairwise comparison:
df3$diff <- df3[,1] - df3[,2]
df3[with(df3, order(-abs(diff))), ]
# # then if you had three years you might then do
# # a comparison of yrs 1 and 3
# df3$diff2 <- df3[,1] - df3[,3]
# df3[with(df3, order(-abs(diff2))), ]
# # and the other pairwise comparison of yrs 2 and 3
# df3$diff3 <- df3[,2] - df3[,3]
# df3[with(df3, order(-abs(diff3))), ]
## and so on
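# A more general sketch for three or more years. Assumption: the year
# columns are the first length(years) columns of df3, as constructed above.
# This computes every pairwise difference without modifying df3 itself.
year_pairs <- combn(seq_along(years), 2, simplify = FALSE)
pairwise_diffs <- sapply(year_pairs, function(p) df3[, p[1]] - df3[, p[2]])
colnames(pairwise_diffs) <- sapply(year_pairs,
                                   function(p) paste0("diff_", years[p[1]], "_", years[p[2]]))
# e.g. topics ordered by the size of the first pairwise difference:
# df3$topic[order(-abs(pairwise_diffs[, 1]))]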
# plot
library(reshape2)
# We reshape from wide to long and drop the 'diff' column that we
# computed above by using a negative index: that's the -4 in the line
# below. You'll need to change that value if you have more than two
# years; replacing it with -ncol(df3) will do the trick if you added
# just one diff column.
df3m <- melt(df3[,-4], id = 3)
ggplot(df3m, aes(topic, value, fill = as.factor(topic))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ variable)