forked from pnandak/text_mining
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_word_research_20131101.R
120 lines (94 loc) · 4.36 KB
/
test_word_research_20131101.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#基于词序的关键词提取
#生成路径(边、投票、link)集E-矩阵
require(tau)
tr.abs <- x
vote <- as.data.frame( as.matrix( textcnt( tr.abs[1], split = ' ', method = 'string', n=2L) )) #两两组词
for ( i in 2:length(tr.abs)) {
b <- as.data.frame( as.matrix( textcnt( tr.abs[i], split = ' ', method = 'string', n=2L) ))
vote <- rbind(b, vote)
} #所有文档的词对和频数堆在一起
#in 和out的不加区分
vote.name <- gsub("[0-9]", "", row.names(vote) ) #重复的词被标记了不同数字,要把数字去掉
vote.name <- strsplit( vote.name , ' ')
vote.n1 <- unlist( lapply(vote.name, function(x) x[1]))
vote.n2 <- unlist( lapply(vote.name, function(x) x[2]))
row.names(vote) <- NULL
vote <- cbind( vote.n1,vote.n2, vote)
names(vote) <- c("kw1", "kw2" , "freq")
vote <- vote[which(vote[,1] != '' & vote[,2] != ''),]
vote <- vote[ vote[,1] %in% names(abs.atfpdf) & vote[,2] %in% names(abs.atfpdf), ] #atfpdf
head(vote)
trdata <- as.matrix( vote)
# 计算rank系数至收敛
### Importing data as a matrix with columns ["KW1", "KW2", "Freq"] ###
### Here the following matrix "a" is considered as an directed graph,
### which means rows [x,y,n] and [y,x,n] have different directions.
### A row [x,y,n] means there is a link from x to y with weight n.
# NOTICE: If "a" is NOT directed, run the following line
trdata = rbind(trdata, cbind(trdata[,2],trdata[,1],trdata[,3]));
### Constructing data describing the network ###
# sortedOriginalData = a[order(a[,1],a[,2]),]; # not needed
vertexData = unique(as.vector(trdata[,c(1,2)])); # This is a vector of vertices
vertexDataID = cbind(vertexData, vID=1:length(vertexData)); # This is a vector of vertices together with a vector of their IDs
# An ID version of matrix "a":
aID = matrix(, nrow = nrow(trdata), ncol = ncol(trdata));
for (i in 1:nrow(trdata)) {
aID[i,1] = vertexDataID[vertexDataID[,1] == trdata[i,1],2];
aID[i,2] = vertexDataID[vertexDataID[,1] == trdata[i,2],2];
aID[i,3] = trdata[i,3];
}
inListID = list(); # list of predecessors' IDs of each vertex
for (i in 1:length(vertexData)) {
inListID[[i]] = aID[aID[,2] == vertexDataID[i,2],1];
}
inListW = list(); # list of predecessors' weights of each vertex
for (i in 1:length(vertexData)) {
inListW[[i]] = aID[aID[,2] == vertexDataID[i,2],3];
}
# A list of the sum of outgoing link weights of each vertex:
outSum = array(0, dim = length(vertexData));
for (i in 1:length(vertexData)) {
outSum[i] = sum(as.integer(trdata [ trdata [,1]==vertexData[i],3]));
}
### Evaluating the weighted score vector ###
d = 0.85; # damping factor
epsilon = 0.000001; # precision of convergence
maxIteration = 100; # maximum number of iteration steps
WS = array(1, dim = length(vertexData)); # weighted score vector
errorSeries = array(0, dim = maxIteration); # error series
for (n in 1:maxIteration) {
WS_temp = array(0, dim = length(vertexData)); # temporary weighted score vector
for (i in 1:length(vertexData)) {
WS_temp[i] = (1-d) + d * ((as.integer(inListW[[i]]) / outSum[as.integer(inListID[[i]])]) %*% WS[as.integer(inListID[[i]])]);
}
errorSeries[n] = max(abs(WS - WS_temp));
WS = WS_temp;
if (errorSeries[n] < epsilon) {break;}
}
### Output as a ranking of words ###
scoredData = as.data.frame(cbind(vertexData,WS)); # combining the keywords and their scores
sortedData = scoredData[order(WS, decreasing=TRUE),]; # sorting.
sortedData
# sortedData is the full ranked keywords list
rm(aID, b, dtm.data, scoredData, trdata, vertexDataID, vote, WS, WS_temp, con, d , dtm,
epsilon, errorSeries, i, inListID, inListW, maxIteration, n, outSum , t, tr.abs,
vertexData, vote.n1, vote.n2, vote.name, x)
#几种方法得到的关键词
names(atf)[1:10]
names(tf)[1:10]
sortedData[1:10,1]
#取合集,得到标签超集(标签集)
i <- 200
lables0 <- names(atf) [names(atf)[1:i] %in% names(tf)[1:i]]
lables0 <- lables0[ lables0 %in% sortedData[1:i,1] ]
#标签集并dtm变量集
out1 <- dtm.data[, names(dtm.data) %in% lables0]
##########关键词的可视化展示
library(sqldf)
vote.stat <- sqldf(" select kw1,kw2,sum(freq) as n from vote
group by kw1,kw2 having kw1 != '' and sum(freq) >1 order by sum(freq) desc")
require(igraph)
#滤掉关系不大的词
vote1 <- vote[ vote[,1] %in% names(abs.atfpdf[1:50]) & vote[,2] %in% names(abs.atfpdf[1:50]), ] #atfpdf
g <- graph.data.frame( vote1 , directed =F)
plot(g, edge.width = E(g)$weight)