forked from pnandak/text_mining
-
Notifications
You must be signed in to change notification settings - Fork 0
/
page_rank_20131121.R
146 lines (118 loc) · 5.04 KB
/
page_rank_20131121.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Text-mining helper functions
# Print, for every part-of-speech tag, up to ten distinct example words.
#
# x: a list of segmented texts; each element is a character vector of words
#    whose names are the part-of-speech tags of those words.
#
# Output: one line per tag on stdout, in the form "<tag> <word> <word> ... ".
# Returns NULL invisibly; the function is called for its printed output.
#
# NOTE(review): `wordstyle` is defined more than once in this file; the last
# definition sourced is the one that takes effect.
wordstyle <- function(x) {
  words <- unlist(x)
  if (length(words) == 0L) {
    # Nothing to report for empty input.
    return(invisible(NULL))
  }
  tags <- names(words)
  if (is.null(tags)) {
    stop("wordstyle() needs words named by their part-of-speech tag",
         call. = FALSE)
  }
  words <- as.character(words)
  for (tag in unique(tags)) {
    # head() keeps at most ten examples without padding short groups with NA
    # (indexing with [1:10] printed literal "NA" entries).
    examples <- utils::head(unique(words[tags %in% tag]), 10L)
    cat(tag, examples, "\n")
  }
  invisible(NULL)
}
# Normalise segmented text: lower-case it, remove punctuation and collapse
# repeated whitespace, using the `tm` package.
#
# x: text to clean; coerced with as.character(). When x is a list of word
#    vectors, as.character() deparses each element to 'c("w1", "w2")', so the
#    leading "c(" of that deparse form is stripped before cleaning.
#
# Returns the result of as.character() applied to the cleaned corpus.
#
# NOTE(review): this was written against an early tm API. Recent tm versions
# document content_transformer(tolower) for tm_map() and may not return plain
# document text from as.character() on a corpus; confirm against the
# installed tm version.
# NOTE(review): `charclear` is defined more than once in this file.
charclear <- function(x) {
  library(tm)
  x <- as.character(x)
  # Strip only the deparse prefix "c(". The previous pattern '^c' removed the
  # first letter of any text that merely starts with "c" ("cat" -> "at").
  x <- sub("^c\\(", "", x)
  x <- Corpus(VectorSource(x))
  x <- tm_map(x, tolower)            # normalise case
  x <- tm_map(x, removePunctuation)  # drop punctuation
  x <- tm_map(x, stripWhitespace)    # collapse extra whitespace
  as.character(x)
}
# Part-of-speech filtering: from every segmented text in `txt.reg`, drop the
# words whose name (their part-of-speech tag) is in the exclusion list below.
#
# lapply() keeps `txt.reg` a list in every case; sapply() would silently
# collapse it into a vector or matrix whenever all filtered texts happened to
# have the same length. %in% also never yields NA, unlike a chain of `!=`.
txt.reg <- lapply(txt.reg, function(x) {
  drop.tags <- c("uj", "c", "y", "e", "ud", "ul", "p", "o", "uv", "uz",
                 "d", "m", "en", "f", "u", "ug", "k")
  x[!(names(x) %in% drop.tags)]
})
################################################################3
# Clean text with the `tm` package: lower-case, strip punctuation, and squeeze
# runs of whitespace.
#
# x: input text, coerced with as.character(). A list of word vectors deparses
#    to strings such as 'c("w1", "w2")'; the leading "c(" left by that
#    conversion is removed first.
#
# Returns as.character() of the cleaned corpus.
#
# NOTE(review): targets an early tm API. Newer tm releases document
# content_transformer(tolower) for use with tm_map(), and as.character() on a
# corpus may not give the document text there; verify with the installed tm.
# NOTE(review): `charclear` is defined more than once in this file; the last
# definition sourced wins.
charclear <- function(x) {
  library(tm)
  x <- as.character(x)
  # Only remove the deparse artefact "c(": the old pattern '^c' also ate the
  # first letter of ordinary text beginning with "c".
  x <- sub("^c\\(", "", x)
  corpus <- Corpus(VectorSource(x))
  corpus <- tm_map(corpus, tolower)            # lower-case
  corpus <- tm_map(corpus, removePunctuation)  # remove punctuation
  corpus <- tm_map(corpus, stripWhitespace)    # collapse whitespace
  as.character(corpus)
}
# textrank
# Count order-sensitive word co-occurrences (adjacent word pairs) with
# tau::textcnt and return them as an edge list for ranking.
#
# x: text passed straight to textcnt(); words are expected to be separated by
#    single spaces.
#
# Returns a data.frame with character columns:
#   kw1  - first word of the pair
#   kw2  - second word of the pair
#   freq - number of occurrences of the pair, kept as text exactly as the
#          original cbind()-based construction produced it (convert with
#          as.integer() before doing arithmetic)
# Rows follow the order returned by textcnt(decreasing = TRUE).
term.rank <- function(x) {
  # library() fails loudly when tau is missing; require() only returned FALSE
  # and the failure surfaced later as "could not find function".
  library(tau)
  # Order-sensitive bigram counts.
  counts <- textcnt(x, split = " ", method = "string", n = 2L,
                    decreasing = TRUE)
  if (length(counts) == 0L) {
    return(data.frame(kw1 = character(0), kw2 = character(0),
                      freq = character(0), stringsAsFactors = FALSE))
  }
  pairs <- strsplit(names(counts), split = " ")
  # vapply() guarantees character vectors whatever the input size, and
  # building the data frame directly gives automatic row names (passing
  # row.names = FALSE produced a row named "FALSE" for one-row results).
  data.frame(
    kw1 = vapply(pairs, function(p) p[1], character(1)),
    kw2 = vapply(pairs, function(p) p[2], character(1)),
    freq = as.character(counts),
    stringsAsFactors = FALSE
  )
  #vote <- vote[ vote[,1] %in% names(atf) & vote[,2] %in% names(tf), ] #atfpdf
}
# For each part-of-speech tag found in the input, print the tag followed by at
# most ten distinct words carrying that tag.
#
# x: list of segmented texts, each a character vector of words named by their
#    part-of-speech tag.
#
# Prints one line per tag ("<tag> <word> ... ") and returns NULL invisibly.
#
# NOTE(review): `wordstyle` is defined more than once in this file; the last
# definition sourced is the one in effect.
wordstyle <- function(x) {
  flat <- unlist(x)
  if (length(flat) == 0L) {
    # Empty input: nothing to print.
    return(invisible(NULL))
  }
  pos <- names(flat)
  if (is.null(pos)) {
    stop("wordstyle() needs words named by their part-of-speech tag",
         call. = FALSE)
  }
  flat <- as.character(flat)
  for (tag in unique(pos)) {
    distinct <- unique(flat[pos %in% tag])
    # Cap at ten examples; the old [1:10] index padded short groups with NA,
    # which cat() printed as the text "NA".
    cat(tag, utils::head(distinct, 10L), "\n")
  }
  invisible(NULL)
}
# Weighted PageRank / TextRank scores for a directed keyword graph, iterated
# until convergence.
#
# x: edge list with at least three columns [kw1, kw2, freq], as a matrix or a
#    data.frame. A row (a, b, n) is a link from a to b with weight n; rows
#    (a, b, n) and (b, a, n) are different links. For an undirected graph,
#    append the reversed rows before calling. When x is missing or NULL the
#    variable `trdata` visible from this function's defining environment is
#    used, which is where the original body read its data from.
# d: damping factor.
# epsilon: convergence tolerance on the largest per-vertex score change.
# maxIteration: upper bound on the number of iteration steps.
#
# Returns a data.frame with columns `vertexData` (keyword, character) and `WS`
# (score, numeric), sorted by decreasing score. The previous body ended in
# rm(...), so it always returned NULL and warned about objects that did not
# exist; locals are released automatically when the function returns.
xxx <- function(x, d = 0.85, epsilon = 0.000001, maxIteration = 100) {
  edges <- if (missing(x) || is.null(x)) trdata else x
  if (length(dim(edges)) != 2L || ncol(edges) < 3L) {
    stop("edge data needs at least three columns: kw1, kw2, freq",
         call. = FALSE)
  }

  # Column access that works for both matrices and data frames.
  column <- function(j) if (is.data.frame(edges)) edges[[j]] else edges[, j]
  from <- as.character(column(1L))
  to <- as.character(column(2L))
  # as.character() first so factor columns yield their labels, not codes.
  weight <- as.numeric(as.character(column(3L)))
  if (anyNA(weight)) {
    stop("edge weights must be numeric and non-missing", call. = FALSE)
  }

  # Vertices in order of first appearance (all sources, then all targets).
  vertexData <- unique(c(from, to))
  nVertex <- length(vertexData)
  if (nVertex == 0L) {
    return(data.frame(vertexData = character(0), WS = numeric(0),
                      stringsAsFactors = FALSE))
  }
  fromID <- match(from, vertexData)
  toID <- match(to, vertexData)

  # Sum `values` per vertex ID; vertices that never occur get 0.
  sumByVertex <- function(values, ids) {
    total <- numeric(nVertex)
    grouped <- rowsum(values, ids)
    total[as.integer(rownames(grouped))] <- grouped[, 1L]
    total
  }

  # Share of each source vertex's total outgoing weight carried by each edge.
  outSum <- sumByVertex(weight, fromID)
  share <- weight / outSum[fromID]

  WS <- rep(1, nVertex)
  for (n in seq_len(maxIteration)) {
    # Every vertex is updated from the previous iteration's scores; a vertex
    # without incoming links settles at (1 - d).
    nextWS <- (1 - d) + d * sumByVertex(share * WS[fromID], toID)
    delta <- max(abs(WS - nextWS))
    WS <- nextWS
    if (delta < epsilon) {
      break
    }
  }

  scoredData <- data.frame(vertexData = vertexData, WS = WS,
                           stringsAsFactors = FALSE)
  scoredData[order(WS, decreasing = TRUE), ]
}