tidyverse
diff --git a/‎.Rbuildignore‎
Lines changed: 6 additions & 0 deletions b/‎.Rbuildignore‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎CRAN-SUBMISSION‎
Lines changed: 3 additions & 0 deletions b/‎CRAN-SUBMISSION‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 13 additions & 8 deletions b/‎DESCRIPTION‎
Lines changed: 13 additions & 8 deletions
diff --git a/‎NEWS.md‎
Lines changed: 3 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎R/ellmer.R‎
Lines changed: 0 additions & 1 deletion b/‎R/ellmer.R‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎R/ragnar-package.R‎
Lines changed: 4 additions & 0 deletions b/‎R/ragnar-package.R‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎R/read-markdown.R‎
Lines changed: 56 additions & 42 deletions b/‎R/read-markdown.R‎
Lines changed: 56 additions & 42 deletions
diff --git a/‎R/retrieve.R‎
Lines changed: 19 additions & 8 deletions b/‎R/retrieve.R‎
Lines changed: 19 additions & 8 deletions
diff --git a/‎R/store.R‎
Lines changed: 2 additions & 2 deletions b/‎R/store.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/utils.R‎
Lines changed: 1 addition & 15 deletions b/‎R/utils.R‎
Lines changed: 1 addition & 15 deletions
@@ -8,7 +8,13 @@
 ^\.ruff_cache$
 ^\.vscode$
 ^.*ragnar\.duckdb$
+^.*ragnar\.store$
 ^LICENSE\.md$
 ^_pkgdown\.yml$
 ^docs$
 ^pkgdown$
+^\.claude$
+^.*chat\.Rmd$
+^_chats$
+^cran-comments\.md$
+^CRAN-SUBMISSION$
@@ -0,0 +1,3 @@
+Version: 0.1.0
+Date: 2025-05-23 15:15:36 UTC
+SHA: 517711973e41ccb26f31d0a908da09b4cde4df06
@@ -1,16 +1,23 @@
 Package: ragnar
 Title: Retrieval-Augmented Generation (RAG) Workflows
-Version: 0.0.0.9000
-Authors@R:
-    person("Tomasz", "Kalinowski", , "[email protected]", role = c("aut", "cre"))
+Version: 0.1.0
+Authors@R: c(
+    person("Tomasz", "Kalinowski", , "[email protected]", role = c("aut", "cre")),
+    person("Daniel", "Falbel", , "[email protected]", role = "aut"),
+    person("Posit Software, PBC", role = c("cph", "fnd"),
+           comment = c(ROR = "03wc8by49"))
+  )
 Description: Provides tools for implementing Retrieval-Augmented Generation
-    (RAG) workflows with Large Language Models (LLMs). Includes functions for
+    (RAG) workflows with Large Language Models (LLM). Includes functions for
     document processing, text chunking, embedding generation, storage
     management, and content retrieval. Supports various document types and
     embedding providers ('Ollama', 'OpenAI'), with 'DuckDB' as the default
     storage backend. Integrates with the 'ellmer' package to equip chat objects
     with retrieval capabilities. Designed to offer both sensible defaults and
     customization options with transparent access to intermediate outputs.
+    For a review of retrieval-augmented generation methods, see Gao et al. (2023)
+    "Retrieval-Augmented Generation for Large Language Models: A Survey"
+    <doi:10.48550/arXiv.2312.10997>.
 License: MIT + file LICENSE
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
@@ -43,7 +50,6 @@ Suggests:
     pandoc,
     ellmer,
     knitr,
-    readr,
     rmarkdown,
     stringr,
     dbplyr,
@@ -52,7 +58,6 @@ Suggests:
     shiny
 Config/Needs/website: tidyverse/tidytemplate, rmarkdown
 Config/testthat/edition: 3
-Remotes:
-    rstudio/reticulate
-URL: https://tidyverse.github.io/ragnar/
+URL: https://ragnar.tidyverse.org/, https://github.com/tidyverse/ragnar
 VignetteBuilder: knitr
+BugReports: https://github.com/tidyverse/ragnar/issues
@@ -0,0 +1,3 @@
+# ragnar 0.1.0
+
+* Initial CRAN submission.
@@ -9,7 +9,6 @@
 #' @export
 #'
 #' @examplesIf (file.exists("r4ds.ragnar.duckdb") && Sys.getenv("OPENAI_API_KEY") != "")
-#' @examplesIf file.exists("r4ds.ragnar.duckdb") && Sys.getenv("OPENAI_API_KEY") != ""
 #'
 #' system_prompt <- stringr::str_squish("
 #'   You are an expert assistant in R programming.
 
@@ -18,6 +18,10 @@ dotty::.
   Sys.setenv(RETICULATE_PYTHON = "managed")
   S7::methods_register()
   reticulate::py_require(c(
+    # Pin onnxruntime until this is resolved: https://github.com/microsoft/markitdown/issues/1266
+    # New VC++ version requirement begins:
+    # https://github.com/Microsoft/onnxruntime/releases/tag/v1.21.0
+    if (is_windows()) "onnxruntime<=1.20.1",
     "markitdown[all]"
   ))
 
 
@@ -9,58 +9,57 @@ init_markitdown <- function(...) {
 
 #' Convert files to markdown
 #'
-#' @param x A filepath or url
-#' @inheritParams rlang::args_dots_empty
-# ' @param ... Passed on to `MarkItDown.convert()`
+#' @param x A filepath or url. Accepts a wide variety of file types, including
+#'   PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF
+#'   metadata and speech transcription), HTML, Text-based formats (CSV, JSON, XML),
+#'   ZIP files (iterates over contents), YouTube URLs, and EPubs.
+#' @param ... Passed on to `MarkItDown.convert()`
 #' @param canonical logical, whether to postprocess the output from MarkItDown
 #'   with `commonmark::markdown_commonmark()`.
 #'
 #' @returns A single string of markdown
 #' @export
 #'
-#' @examples
+#' @examplesIf reticulate::py_available()
 #' # convert html
 #' read_as_markdown("https://r4ds.hadley.nz/base-R.html") |>
-#'   substr(1, 1000) |> cat()
+#'   substr(1, 1000) |>
+#'   cat()
 #'
 #' read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |>
-#'   substr(1, 1000) |> cat()
+#'   substr(1, 1000) |>
+#'   cat()
 #'
 #' # convert pdf
 #' pdf <- file.path(R.home("doc"), "NEWS.pdf")
 #' read_as_markdown(pdf) |> substr(1, 1000) |> cat()
 #' ## alternative:
 #' # pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat()
 #'
-#' # convert images
+#' # convert images to markdown descriptions using OpenAI
 #' jpg <- file.path(R.home("doc"), "html", "logo.jpg")
-#' if (FALSE) {
-#'   # system("brew install ffmpeg")
+#' if (Sys.getenv("OPENAI_API_KEY") != "") {
+#'   # if (xfun::is_macos()) system("brew install ffmpeg")
 #'   reticulate::py_require("openai")
 #'   llm_client <- reticulate::import("openai")$OpenAI()
-#'   read_as_markdown(jpg,
-#'     llm_client = llm_client,
-#'     llm_model = "gpt-4o"
-#'   )
+#'   read_as_markdown(jpg, llm_client = llm_client, llm_model = "gpt-4.1-mini")
 #'   # # Description:
-#'   # The image features the official logo of the R programming language.
-#'   # Prominently displayed is a bold, blue letter "R," which serves as the
-#'   # centerpiece of the design. Encircling the "R" is a two-toned,
-#'   # stylized oval or ellipse with a gradient that transitions from dark
-#'   # gray to light gray, creating a sense of motion and dynamics. R is an
-#'   # open-source programming language widely used for statistical
-#'   # computing, data analysis, and graphical representation. The logo
-#'   # represents the language's focus on clarity, precision, and
-#'   # versatility in handling complex data tasks.
+#'   # The image displays the logo of the R programming language. It features a
+#'   # large, stylized capital letter "R" in blue, positioned prominently in the
+#'   # center. Surrounding the "R" is a gray oval shape that is open on the right
+#'   # side, creating a dynamic and modern appearance. The R logo is commonly
+#'   # associated with statistical computing, data analysis, and graphical
+#'   # representation in various scientific and professional fields.
 #' }
 #'
 #' # Alternative approach to image conversion:
-#' if(FALSE) {
-#'   if (Sys.getenv("OPENAI_API_KEY") != "") {
-#'     rlang::check_installed(c("ellmer", "magick"))
-#'     chat <- ellmer::chat_openai(echo = TRUE)
-#'     chat$chat("Describe this image", ellmer::content_image_file(jpg))
-#'   }
+#' if (
+#'   Sys.getenv("OPENAI_API_KEY") != "" &&
+#'     rlang::is_installed("ellmer") &&
+#'     rlang::is_installed("magick")
+#' ) {
+#'   chat <- ellmer::chat_openai(echo = TRUE)
+#'   chat$chat("Describe this image", ellmer::content_image_file(jpg))
 #' }
 read_as_markdown <- function(x, ..., canonical = FALSE) {
   check_string(x)
@@ -135,9 +134,12 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
     tag = elements |> xml_name(),
     source_position = elements |> xml_attr("data-sourcepos")
   )
-  if (length(tags)) df <- df[df$tag %in% unique(c(tags)), ]
 
-  # common mark returns positions as line:byte-line:byte
+  if (length(tags)) {
+    df <- df[df$tag %in% unique(c(tags)), ]
+  }
+
+  # commonmark returns positions as line:byte-line:byte
   # e.g., 52:1-52:20
   position <- df$source_position |>
     stri_split_charclass("[-:]", n = 4L, simplify = TRUE)
@@ -147,13 +149,15 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
   line_numbytes <- stri_numbytes(lines) + 1L # +1 for \n
   line_startbyte <- c(1L, 1L + drop_last(cumsum(line_numbytes)))
 
-  start <- line_startbyte[position[, "start_line"]] +
-    position[, "start_byte"] -
-    1L
-  end <- line_startbyte[position[, "end_line"]] + position[, "end_byte"] - 1L
+  start <-
+    line_startbyte[position[, "start_line"]] + position[, "start_byte"] - 1L
+  end <-
+    line_startbyte[position[, "end_line"]] + position[, "end_byte"] - 1L
 
   ## To convert byte to char index:
-  # char_byte_indexes <- stri_split_boundaries(text, type = "character")[[1L]] |> stri_numbytes() |> cumsum()
+  # char_byte_indexes <-
+  #   stri_split_boundaries(text, type = "character")[[1L]] |>
+  #   stri_numbytes() |> cumsum()
   # start <- match(start, char_byte_indexes)
   # end <- match(end, char_byte_indexes)
   tibble::tibble(tag = df$tag, start = start, end = end)
@@ -163,11 +167,13 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
 #' Segment markdown text
 #'
 #' @param text Markdown string
-#' @param tags,segment_by A character vector of html tag names, e.g., `c("h1", "h2", "h3", "pre")`
+#' @param tags,segment_by A character vector of html tag names, e.g.,
+#'   `c("h1", "h2", "h3", "pre")`
 #' @param trim logical, trim whitespace on segments
 #' @param omit_empty logical, whether to remove empty segments
 #'
-#' @returns A named character vector. Names will correspond to `tags`, or `""` for content inbetween tags.
+#' @returns A named character vector. Names will correspond to `tags`, or `""`
+#'   for content in between tags.
 #' @export
 #'
 #' @examples
@@ -200,9 +206,9 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
 #' A table <table>:
 #'
 #'   | Name  | Age | City      |
-#'   |-------|----:|----------|
-#'   | Alice |  25 | New York |
-#'   | Bob   |  30 | London   |
+#'   |-------|----:|-----------|
+#'   | Alice |  25 | New York  |
+#'   | Bob   |  30 | London    |
 #'
 #'
 #' ## Conclusion
@@ -211,7 +217,8 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
 #'
 #' - h1, h2, h3, h4, h5, h6: section headings
 #' - p: paragraph (prose)
-#' - pre: pre-formatted text, meant to be displayed with monospace font. Typically code or code output
+#' - pre: pre-formatted text, meant to be displayed with monospace font.
+#'   Typically code or code output
 #' - blockquote: A blockquote
 #' - table: A table
 #' - ul: Unordered list
@@ -339,7 +346,7 @@ markdown_segment_text <- function(
 #'
 #' @export
 #'
-#' @examples
+#' @examplesIf reticulate::py_available()
 #' file <- tempfile(fileext = ".html")
 #' download.file("https://r4ds.hadley.nz/base-R.html", file, quiet = TRUE)
 #'
@@ -462,3 +469,10 @@ cli_markitdown <- function(args, ...) {
     ...
   )
 }
+
+
+should_init_python <- function() {
+  reticulate::py_available() ||
+    interactive() ||
+    identical(Sys.getenv("IN_PKGDOWN"), "true")
+}
@@ -266,35 +266,46 @@ ragnar_retrieve_vss_and_bm25 <- function(store, text, top_k = 3, ...) {
 #'
 #' @family ragnar_retrieve
 #' @export
-#' @examples
+#' @examplesIf (rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")))
 #' # Basic usage
-#' mock_embed <- function(x) matrix(stats::runif(10), nrow = length(x), ncol = 10)
-#' store <- ragnar_store_create(embed = mock_embed)
+#' store <- ragnar_store_create(
+#'   embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small")
+#' )
 #' ragnar_store_insert(store, data.frame(text = c("foo", "bar")))
 #' ragnar_store_build_index(store)
 #' ragnar_retrieve(store, "foo")
 #'
 #' # More Advanced: store metadata, retrieve with pre-filtering
 #' store <- ragnar_store_create(
-#'   embed = mock_embed,
+#'   embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"),
 #'   extra_cols = data.frame(category = character())
 #' )
+#'
+#' ragnar_store_insert(
+#'   store,
+#'   data.frame(
+#'     category = "dessert",
+#'     text = c("ice cream", "cake", "cookies")
+#'   )
+#' )
+#'
 #' ragnar_store_insert(
 #'   store,
 #'   data.frame(
-#'     category = c("desert", "desert", "desert", "meal", "meal", "meal"),
-#'     text = c("ice cream", "cake", "cookies", "pasta", "burger", "salad")
+#'     category = "meal",
+#'     text = c("steak", "potatoes", "salad")
 #'   )
 #' )
+#'
 #' ragnar_store_build_index(store)
 #'
 #' # simple retrieve
-#' ragnar_retrieve(store, "yummy")
+#' ragnar_retrieve(store, "carbs")
 #'
 #' # retrieve with pre-filtering
 #' dplyr::tbl(store) |>
 #'   dplyr::filter(category == "meal") |>
-#'   ragnar_retrieve("yummy")
+#'   ragnar_retrieve("carbs")
 ragnar_retrieve <- function(store, text, top_k = 3L) {
   ragnar_retrieve_vss_and_bm25(store, text, top_k)
 }
@@ -469,7 +469,7 @@ ragnar_store_build_index <- function(store, type = c("vss", "fts")) {
   if ("vss" %in% type && !is.null(store@embed)) {
     # TODO: duckdb has support for three different distance metrics that can be
     # selected when building the index: l2sq, cosine, and ip. Expose these as options
-    # in the R interface. https://duckdb.org/docs/extensions/vss.html#usage
+    # in the R interface. https://duckdb.org/docs/stable/core_extensions/vss#usage
     dbExecute(con, "INSTALL vss;")
     dbExecute(con, "LOAD vss;")
     dbExecute(
@@ -486,7 +486,7 @@ ragnar_store_build_index <- function(store, type = c("vss", "fts")) {
     dbExecute(con, "INSTALL fts;")
     dbExecute(con, "LOAD fts;")
     # fts index builder takes many options, e.g., stemmer, stopwords, etc.
-    # Expose a way to pass along args. https://duckdb.org/docs/extensions/full_text_search.html
+    # Expose a way to pass along args. https://duckdb.org/docs/stable/core_extensions/full_text_search
     dbExecute(
       con,
       "PRAGMA create_fts_index('chunks', 'id', 'text', overwrite = 1);"
 
@@ -139,19 +139,5 @@ reorder_names <- function(..., last = NULL) {
   x
 }
 
+is_windows <- function() identical(.Platform$OS.type, "windows")
 
-ollama_ls <- function() {
-  rlang::check_installed("readr")
-  tbl <- system2("ollama", "list", stdout = TRUE)
-  header <- tbl[1]
-  col_starts <- stringi::stri_locate_all_words(header)[[1]][, "start"]
-  col_positions <- readr::fwf_positions(
-    start = col_starts,
-    end = c(col_starts[-1L] - 1L, NA),
-    col_names = stringi::stri_extract_all_words(header)[[1]] |>
-      stringi::stri_trans_tolower()
-  )
-  col_types <- "cccc"
-
-  readr::read_fwf(I(tbl[-1]), col_positions, col_types)
-}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Version: 0.1.0`
	`2`	`+Date: 2025-05-23 15:15:36 UTC`
	`3`	`+SHA: 517711973e41ccb26f31d0a908da09b4cde4df06`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# ragnar 0.1.0`
	`2`	`+`
	`3`	`+* Initial CRAN submission.`