Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrote textrank_sentences() #7

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: textrank
Type: Package
Title: Summarize Text by Ranking Sentences and Finding Keywords
Version: 0.3.0
Version: 0.3.1
Maintainer: Jan Wijffels <[email protected]>
Author: Jan Wijffels [aut, cre, cph], BNOSAC [cph]
Authors@R: c(person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = '[email protected]'), person('BNOSAC', role = 'cph'))
Expand All @@ -13,5 +13,5 @@ URL: https://github.com/bnosac/textrank
Encoding: UTF-8
Imports: utils, data.table (>= 1.9.6), igraph, digest
Suggests: textreuse, knitr, udpipe (>= 0.2)
RoxygenNote: 6.0.1
RoxygenNote: 6.1.1
VignetteBuilder: knitr
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ importFrom(data.table,data.table)
importFrom(data.table,rbindlist)
importFrom(data.table,setDF)
importFrom(data.table,setDT)
importFrom(data.table,setcolorder)
importFrom(data.table,setkeyv)
importFrom(digest,digest)
importFrom(igraph,graph_from_data_frame)
importFrom(igraph,page_rank)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# CHANGES IN textrank VERSION 0.3.1

- Speedup textrank_sentences (thanks to @emillykkejensen)

# CHANGES IN textrank VERSION 0.3.0

- Speedup textrank_candidates_all (thanks to @emillykkejensen)
Expand Down
2 changes: 1 addition & 1 deletion R/pkg.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#' @importFrom utils head combn
#' @importFrom igraph graph_from_data_frame page_rank
#' @importFrom data.table rbindlist as.data.table setDF setDT data.table ":=" ".N"
#' @importFrom data.table rbindlist as.data.table setDF setDT data.table ":=" ".N" setkeyv setcolorder
#' @importFrom digest digest
NULL
92 changes: 55 additions & 37 deletions R/textrank.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ textrank_candidates_all <- function(x){
})
}
candidates <- data.table::rbindlist(candidates)
candidates <- setDF(candidates)
candidates <- data.table::setDF(candidates)
candidates
}

Expand Down Expand Up @@ -149,6 +149,7 @@ textrank_candidates_all <- function(x){
#' @param max integer indicating to reduce the number of sentence to sentence combinations to compute.
#' In case provided, we take only this max amount of rows from \code{textrank_candidates}
#' @param options_pagerank a list of arguments passed on to \code{\link[igraph]{page_rank}}
#' @param trace logical indicating to show the progress of the textrank_sentences function. Can also be a positive integer in which case it will print the progress for every \code{trace} number of sentences.
#' @param ... arguments passed on to \code{textrank_dist}
#' @seealso \code{\link[igraph]{page_rank}}, \code{\link{textrank_candidates_all}}, \code{\link{textrank_candidates_lsh}}, \code{\link{textrank_jaccard}}
#' @return an object of class textrank_sentences
Expand Down Expand Up @@ -185,60 +186,77 @@ textrank_candidates_all <- function(x){
#' textrank_candidates = candidates)
#' summary(tr, n = 2)
#' }
#'
#' ## You can also reduce the number of sentence combinations by sampling
#' tr <- textrank_sentences(data = sentences, terminology = terminology, max = 100)
#' tr <- textrank_sentences(data = sentences, terminology = terminology, max = 100, trace = 10)
#' tr
#' summary(tr, n = 2)
textrank_sentences <- function(data, terminology,
textrank_dist = textrank_jaccard,
textrank_candidates = textrank_candidates_all(data$textrank_id),
max = 1000,
options_pagerank = list(directed = FALSE),
...){
textrank_id <- NULL

stopifnot(sum(duplicated(data[, 1])) == 0)
data <- as.data.frame(data)
stopifnot(nrow(data) > 1)
data.table::setnames(data, old = colnames(data)[1:2], new = c("textrank_id", "sentence"))

terminology <- as.data.table(terminology)
data.table::setnames(terminology, old = colnames(terminology)[1:2], new = c("textrank_id", "term"))
data.table::setkey(terminology, "textrank_id")
textrank_dist = textrank_jaccard,
textrank_candidates = textrank_candidates_all(data$textrank_id),
max = 1000,
options_pagerank = list(directed = FALSE),
trace = FALSE,
...){
textrank_id_1 <- textrank_id_2 <- textrank_id <- term <- NULL
stopifnot(trace >= 0)
stopifnot(inherits(data, "data.frame"))
stopifnot(inherits(terminology, "data.frame"))
if(sum(duplicated(data[[1]])) > 0){
stop("The first column of data should be a sentence identifier which should not contain any duplicates")
}
## Make sure all datasets are data.tables and these are copies
data <- data.table::as.data.table(data)
data <- data.table::setnames(data, old = colnames(data)[1:2], new = c("textrank_id", "sentence"))
terminology <- data.table::as.data.table(terminology)
terminology <- data.table::setnames(terminology, old = colnames(terminology)[1:2], new = c("textrank_id", "term"))
terminology <- data.table::setkeyv(terminology, "textrank_id")
stopifnot(inherits(textrank_candidates, "data.frame"))
stopifnot(all(c("textrank_id_1", "textrank_id_2") %in% colnames(textrank_candidates)))
textrank_candidates <- data.table::as.data.table(textrank_candidates)
if(!missing(max)){
max <- min(nrow(textrank_candidates), max)
textrank_candidates <- textrank_candidates[sample.int(n = nrow(textrank_candidates), size = max), ]
}
textrank_candidates <- data.table::setkeyv(textrank_candidates, "textrank_id_1")

## Calculate pairwise distances along all sentence combinations
sentence_dist <- function(id1, id2, distFUN, ...){
data1 <- terminology[textrank_id == id1, ]
data2 <- terminology[textrank_id == id2, ]
if(nrow(data1) == 0 || nrow(data2) == 0){
w <- 0
}else{
w <- distFUN(data1$term, data2$term, ...)
sentence_dist <- function(i){
if(trace){
if((i %% trace) == 0){
cat(sprintf("%s Calculating on textrank_id_1 %s/%s", Sys.time(), i, sentence_ids_n_left), sep = "\n")
}
}
data.frame(
textrank_id_1 = id1,
textrank_id_2 = id2,
weight = w,
stringsAsFactors = FALSE)
sentence_id_left <- sentence_ids_left[i]
id_right <- textrank_candidates[textrank_id_1 == sentence_id_left, textrank_id_2]
tokens_left <- terminology[textrank_id == sentence_id_left, term]
result <- terminology[textrank_id %in% id_right, list(weight = textrank_dist(tokens_left, term, ...)), by = list(textrank_id_2 = textrank_id)]
result <- result[, textrank_id_1 := sentence_id_left]
result
}
sent2sent_distance <- as.data.frame(textrank_candidates)
if(!missing(max)){
max <- min(nrow(sent2sent_distance), max)
sent2sent_distance <- sent2sent_distance[sample.int(n = nrow(sent2sent_distance), size = max), ]
sentence_ids_left <- unique(textrank_candidates$textrank_id_1)
sentence_ids_n_left <- length(sentence_ids_left)
if(trace){
cat(sprintf("%s Start calculating %s pairwise sentence comparisons", Sys.time(), nrow(textrank_candidates)), sep = "\n")
}
sent2sent_distance <- mapply(id1 = sent2sent_distance$textrank_id_1,
id2 = sent2sent_distance$textrank_id_2, FUN = sentence_dist, MoreArgs = list(distFUN = textrank_dist, ...),
SIMPLIFY = FALSE)
sent2sent_distance <- lapply(seq_along(sentence_ids_left), sentence_dist)
sent2sent_distance <- data.table::rbindlist(sent2sent_distance)
sent2sent_distance <- data.table::setcolorder(sent2sent_distance, neworder = c("textrank_id_1", "textrank_id_2", "weight"))
sent2sent_distance <- data.table::setDF(sent2sent_distance)

## Calculate pagerank
if(trace){
cat(sprintf("%s Start calculating pagerank", Sys.time()), sep = "\n")
}
pr <- igraph::graph_from_data_frame(sent2sent_distance, directed = FALSE)
options_pagerank$graph <- pr
pr <- do.call(igraph::page_rank, options_pagerank)

## Add pagerank to sentences for having it in the ouput
data <- data.table::setDF(data)
data <- merge(data,
data.frame(textrank_id = names(pr$vector), textrank = as.numeric(pr$vector), stringsAsFactors = FALSE),
data.frame(textrank_id = names(pr$vector),
textrank = as.numeric(pr$vector), stringsAsFactors = FALSE),
by = "textrank_id", all.x=TRUE, all.y=FALSE, sort = FALSE, suffixes = c("sent.", ""))

result <- list(
Expand Down
8 changes: 6 additions & 2 deletions man/textrank_sentences.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.