bnosac · emillykkejensen · Jan 21, 2019 · Feb 4, 2019 · Feb 14, 2019
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: textrank
 Type: Package
 Title: Summarize Text by Ranking Sentences and Finding Keywords
-Version: 0.3.0
+Version: 0.3.1
 Maintainer: Jan Wijffels <[email protected]>
 Author: Jan Wijffels [aut, cre, cph], BNOSAC [cph]
 Authors@R: c(person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = '[email protected]'), person('BNOSAC', role = 'cph'))
@@ -13,5 +13,5 @@ URL: https://github.com/bnosac/textrank
 Encoding: UTF-8
 Imports: utils, data.table (>= 1.9.6), igraph, digest
 Suggests: textreuse, knitr, udpipe (>= 0.2)
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.1
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,8 @@ importFrom(data.table,data.table)
 importFrom(data.table,rbindlist)
 importFrom(data.table,setDF)
 importFrom(data.table,setDT)
+importFrom(data.table,setcolorder)
+importFrom(data.table,setkeyv)
 importFrom(digest,digest)
 importFrom(igraph,graph_from_data_frame)
 importFrom(igraph,page_rank)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# CHANGES IN textrank VERSION 0.3.1
+
+- Speedup textrank_sentences (thanks to @emillykkejensen)
+
 # CHANGES IN textrank VERSION 0.3.0
 
 - Speedup textrank_candidates_all (thanks to @emillykkejensen)

diff --git a/R/pkg.R b/R/pkg.R
@@ -1,5 +1,5 @@
 #' @importFrom utils head combn
 #' @importFrom igraph graph_from_data_frame page_rank
-#' @importFrom data.table rbindlist as.data.table setDF setDT data.table ":=" ".N"
+#' @importFrom data.table rbindlist as.data.table setDF setDT data.table ":=" ".N" setkeyv setcolorder
 #' @importFrom digest digest
 NULL
diff --git a/R/textrank.R b/R/textrank.R
@@ -113,7 +113,7 @@ textrank_candidates_all <- function(x){
     })
   }
   candidates <- data.table::rbindlist(candidates)
-  candidates <- setDF(candidates)
+  candidates <- data.table::setDF(candidates)
   candidates
 }
 
@@ -149,6 +149,7 @@ textrank_candidates_all <- function(x){
 #' @param max integer indicating to reduce the number of sentence to sentence combinations to compute.
 #' In case provided, we take only this max amount of rows from \code{textrank_candidates}
 #' @param options_pagerank a list of arguments passed on to \code{\link[igraph]{page_rank}}
+#' @param trace logical indicating to show the progress of the textrank_sentences function. Can also be a positive integer in which case it will print the progress for every \code{trace} number of sentences.
 #' @param ... arguments passed on to \code{textrank_dist}
 #' @seealso \code{\link[igraph]{page_rank}}, \code{\link{textrank_candidates_all}}, \code{\link{textrank_candidates_lsh}}, \code{\link{textrank_jaccard}}
 #' @return an object of class textrank_sentences
@@ -185,60 +186,77 @@ textrank_candidates_all <- function(x){
 #'                          textrank_candidates = candidates)
 #' summary(tr, n = 2)
 #' }
+#'
 #' ## You can also reduce the number of sentence combinations by sampling
-#' tr <- textrank_sentences(data = sentences, terminology = terminology, max = 100)
+#' tr <- textrank_sentences(data = sentences, terminology = terminology, max = 100, trace = 10)
 #' tr
 #' summary(tr, n = 2)
 textrank_sentences <- function(data, terminology,
-                     textrank_dist = textrank_jaccard,
-                     textrank_candidates = textrank_candidates_all(data$textrank_id),
-                     max = 1000,
-                     options_pagerank = list(directed = FALSE),
-                     ...){
-  textrank_id <- NULL
-
-  stopifnot(sum(duplicated(data[, 1])) == 0)
-  data <- as.data.frame(data)
-  stopifnot(nrow(data) > 1)
-  data.table::setnames(data, old = colnames(data)[1:2], new = c("textrank_id", "sentence"))
-
-  terminology <- as.data.table(terminology)
-  data.table::setnames(terminology, old = colnames(terminology)[1:2], new = c("textrank_id", "term"))
-  data.table::setkey(terminology, "textrank_id")
+                               textrank_dist = textrank_jaccard,
+                               textrank_candidates = textrank_candidates_all(data$textrank_id),
+                               max = 1000,
+                               options_pagerank = list(directed = FALSE),
+                               trace = FALSE,
+                               ...){
+  textrank_id_1 <- textrank_id_2 <- textrank_id <- term <- NULL
+  stopifnot(trace >= 0)
+  stopifnot(inherits(data, "data.frame"))
+  stopifnot(inherits(terminology, "data.frame"))
+  if(sum(duplicated(data[[1]])) > 0){
+    stop("The first column of data should be a sentence identifier which should not contain any duplicates")
+  }
+  ## Make sure all datasets are data.tables and these are copies
+  data <- data.table::as.data.table(data)
+  data <- data.table::setnames(data, old = colnames(data)[1:2], new = c("textrank_id", "sentence"))
+  terminology <- data.table::as.data.table(terminology)
+  terminology <- data.table::setnames(terminology, old = colnames(terminology)[1:2], new = c("textrank_id", "term"))
+  terminology <- data.table::setkeyv(terminology, "textrank_id")
+  stopifnot(inherits(textrank_candidates, "data.frame"))
+  stopifnot(all(c("textrank_id_1", "textrank_id_2") %in% colnames(textrank_candidates)))
+  textrank_candidates <- data.table::as.data.table(textrank_candidates)
+  if(!missing(max)){
+    max <- min(nrow(textrank_candidates), max)
+    textrank_candidates <- textrank_candidates[sample.int(n = nrow(textrank_candidates), size = max), ]
+  }
+  textrank_candidates <- data.table::setkeyv(textrank_candidates, "textrank_id_1")
 
   ## Calculate pairwise distances along all sentence combinations
-  sentence_dist <- function(id1, id2, distFUN, ...){
-    data1 <- terminology[textrank_id == id1, ]
-    data2 <- terminology[textrank_id == id2, ]
-    if(nrow(data1) == 0 || nrow(data2) == 0){
-      w <- 0
-    }else{
-      w <- distFUN(data1$term, data2$term, ...)
+  sentence_dist <- function(i){
+    if(trace){
+      if((i %% trace) == 0){
+        cat(sprintf("%s Calculating on textrank_id_1 %s/%s", Sys.time(), i, sentence_ids_n_left), sep = "\n")
+      }
     }
-    data.frame(
-      textrank_id_1 = id1,
-      textrank_id_2 = id2,
-      weight = w,
-      stringsAsFactors = FALSE)
+    sentence_id_left <- sentence_ids_left[i]
+    id_right <- textrank_candidates[textrank_id_1 == sentence_id_left, textrank_id_2]
+    tokens_left <- terminology[textrank_id == sentence_id_left, term]
+    result <- terminology[textrank_id %in% id_right, list(weight = textrank_dist(tokens_left, term, ...)), by = list(textrank_id_2 = textrank_id)]
+    result <- result[, textrank_id_1 := sentence_id_left]
+    result
   }
-  sent2sent_distance <- as.data.frame(textrank_candidates)
-  if(!missing(max)){
-    max <- min(nrow(sent2sent_distance), max)
-    sent2sent_distance <- sent2sent_distance[sample.int(n = nrow(sent2sent_distance), size = max), ]
+  sentence_ids_left <- unique(textrank_candidates$textrank_id_1)
+  sentence_ids_n_left <- length(sentence_ids_left)
+  if(trace){
+    cat(sprintf("%s Start calculating %s pairwise sentence comparisons", Sys.time(), nrow(textrank_candidates)), sep = "\n")
   }
-  sent2sent_distance <- mapply(id1 = sent2sent_distance$textrank_id_1,
-                               id2 = sent2sent_distance$textrank_id_2, FUN = sentence_dist, MoreArgs = list(distFUN = textrank_dist, ...),
-                               SIMPLIFY = FALSE)
+  sent2sent_distance <- lapply(seq_along(sentence_ids_left), sentence_dist)
   sent2sent_distance <- data.table::rbindlist(sent2sent_distance)
+  sent2sent_distance <- data.table::setcolorder(sent2sent_distance, neworder = c("textrank_id_1", "textrank_id_2", "weight"))
+  sent2sent_distance <- data.table::setDF(sent2sent_distance)
 
   ## Calculate pagerank
+  if(trace){
+    cat(sprintf("%s Start calculating pagerank", Sys.time()), sep = "\n")
+  }
   pr <- igraph::graph_from_data_frame(sent2sent_distance, directed = FALSE)
   options_pagerank$graph <- pr
   pr <- do.call(igraph::page_rank, options_pagerank)
 
   ## Add pagerank to sentences for having it in the ouput
+  data <- data.table::setDF(data)
   data <- merge(data,
-                data.frame(textrank_id = names(pr$vector), textrank = as.numeric(pr$vector), stringsAsFactors = FALSE),
+                data.frame(textrank_id = names(pr$vector),
+                           textrank = as.numeric(pr$vector), stringsAsFactors = FALSE),
                 by = "textrank_id", all.x=TRUE, all.y=FALSE, sort = FALSE, suffixes = c("sent.", ""))
 
   result <- list(

diff --git a/man/textrank_sentences.Rd b/man/textrank_sentences.Rd