Skip to content

Commit ff30269

Browse files
authored
CRAN Release v0.1.0 (#35)
* fixes for `R CMD check --as-cran` * Add NEWS.md * `usethis::use_cran_comments()` * Update (aspirational) install instructions in README * Add Daniel, Posit, to Authors list * Add GitHub links to DESCRIPTION * `devtools::build_readme()` * remove duplicate `@examplesIf` * Increment version number to 0.1.0 * submit to CRAN * retrieve example: only if dbplyr installed * Pin `onnxruntime` on Windows * don't run `read_as_markdown()` examples on CRAN * improve `ragnar_retrieve()` example * limit threads in tests on CRAN * skip `ragnar_read()` tests on cran * DESCRIPTION spelling: 'LLMs' -> 'LLM' * Add doi to DESCRIPTION * update examples based on CRAN comments
1 parent 9e644fb commit ff30269

27 files changed

+310
-177
lines changed

.Rbuildignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@
88
^\.ruff_cache$
99
^\.vscode$
1010
^.*ragnar\.duckdb$
11+
^.*ragnar\.store$
1112
^LICENSE\.md$
1213
^_pkgdown\.yml$
1314
^docs$
1415
^pkgdown$
16+
^\.claude$
17+
^.*chat\.Rmd$
18+
^_chats$
19+
^cran-comments\.md$
20+
^CRAN-SUBMISSION$

CRAN-SUBMISSION

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Version: 0.1.0
2+
Date: 2025-05-23 15:15:36 UTC
3+
SHA: 517711973e41ccb26f31d0a908da09b4cde4df06

DESCRIPTION

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,23 @@
11
Package: ragnar
22
Title: Retrieval-Augmented Generation (RAG) Workflows
3-
Version: 0.0.0.9000
4-
Authors@R:
5-
person("Tomasz", "Kalinowski", , "[email protected]", role = c("aut", "cre"))
3+
Version: 0.1.0
4+
Authors@R: c(
5+
person("Tomasz", "Kalinowski", , "[email protected]", role = c("aut", "cre")),
6+
person("Daniel", "Falbel", , "[email protected]", role = "aut"),
7+
person("Posit Software, PBC", role = c("cph", "fnd"),
8+
comment = c(ROR = "03wc8by49"))
9+
)
610
Description: Provides tools for implementing Retrieval-Augmented Generation
7-
(RAG) workflows with Large Language Models (LLMs). Includes functions for
11+
(RAG) workflows with Large Language Models (LLM). Includes functions for
812
document processing, text chunking, embedding generation, storage
913
management, and content retrieval. Supports various document types and
1014
embedding providers ('Ollama', 'OpenAI'), with 'DuckDB' as the default
1115
storage backend. Integrates with the 'ellmer' package to equip chat objects
1216
with retrieval capabilities. Designed to offer both sensible defaults and
1317
customization options with transparent access to intermediate outputs.
18+
For a review of retrieval-augmented generation methods, see Gao et al. (2023)
19+
"Retrieval-Augmented Generation for Large Language Models: A Survey"
20+
<doi:10.48550/arXiv.2312.10997>.
1421
License: MIT + file LICENSE
1522
Encoding: UTF-8
1623
Roxygen: list(markdown = TRUE)
@@ -43,7 +50,6 @@ Suggests:
4350
pandoc,
4451
ellmer,
4552
knitr,
46-
readr,
4753
rmarkdown,
4854
stringr,
4955
dbplyr,
@@ -52,7 +58,6 @@ Suggests:
5258
shiny
5359
Config/Needs/website: tidyverse/tidytemplate, rmarkdown
5460
Config/testthat/edition: 3
55-
Remotes:
56-
rstudio/reticulate
57-
URL: https://tidyverse.github.io/ragnar/
61+
URL: https://ragnar.tidyverse.org/, https://github.com/tidyverse/ragnar
5862
VignetteBuilder: knitr
63+
BugReports: https://github.com/tidyverse/ragnar/issues

NEWS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# ragnar 0.1.0
2+
3+
* Initial CRAN submission.

R/ellmer.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#' @export
1010
#'
1111
#' @examplesIf (file.exists("r4ds.ragnar.duckdb") && Sys.getenv("OPENAI_API_KEY") != "")
12-
#' @examplesIf file.exists("r4ds.ragnar.duckdb") && Sys.getenv("OPENAI_API_KEY") != ""
1312
#'
1413
#' system_prompt <- stringr::str_squish("
1514
#' You are an expert assistant in R programming.

R/ragnar-package.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ dotty::.
1818
Sys.setenv(RETICULATE_PYTHON = "managed")
1919
S7::methods_register()
2020
reticulate::py_require(c(
21+
# Pin onnxruntime until this is resolved: https://github.com/microsoft/markitdown/issues/1266
22+
# New VC++ version requirement begins:
23+
# https://github.com/Microsoft/onnxruntime/releases/tag/v1.21.0
24+
if (is_windows()) "onnxruntime<=1.20.1",
2125
"markitdown[all]"
2226
))
2327

R/read-markdown.R

Lines changed: 56 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -9,58 +9,57 @@ init_markitdown <- function(...) {
99

1010
#' Convert files to markdown
1111
#'
12-
#' @param x A filepath or url
13-
#' @inheritParams rlang::args_dots_empty
14-
# ' @param ... Passed on to `MarkItDown.convert()`
12+
#' @param x A filepath or url. Accepts a wide variety of file types, including
13+
#' PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF
14+
#' metadata and speech transcription), HTML, Text-based formats (CSV, JSON, XML),
15+
#' ZIP files (iterates over contents), YouTube URLs, and EPubs.
16+
#' @param ... Passed on to `MarkItDown.convert()`
1517
#' @param canonical logical, whether to postprocess the output from MarkItDown
1618
#' with `commonmark::markdown_commonmark()`.
1719
#'
1820
#' @returns A single string of markdown
1921
#' @export
2022
#'
21-
#' @examples
23+
#' @examplesIf reticulate::py_available()
2224
#' # convert html
2325
#' read_as_markdown("https://r4ds.hadley.nz/base-R.html") |>
24-
#' substr(1, 1000) |> cat()
26+
#' substr(1, 1000) |>
27+
#' cat()
2528
#'
2629
#' read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |>
27-
#' substr(1, 1000) |> cat()
30+
#' substr(1, 1000) |>
31+
#' cat()
2832
#'
2933
#' # convert pdf
3034
#' pdf <- file.path(R.home("doc"), "NEWS.pdf")
3135
#' read_as_markdown(pdf) |> substr(1, 1000) |> cat()
3236
#' ## alternative:
3337
#' # pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat()
3438
#'
35-
#' # convert images
39+
#' # convert images to markdown descriptions using OpenAI
3640
#' jpg <- file.path(R.home("doc"), "html", "logo.jpg")
37-
#' if (FALSE) {
38-
#' # system("brew install ffmpeg")
41+
#' if (Sys.getenv("OPENAI_API_KEY") != "") {
42+
#' # if (xfun::is_macos()) system("brew install ffmpeg")
3943
#' reticulate::py_require("openai")
4044
#' llm_client <- reticulate::import("openai")$OpenAI()
41-
#' read_as_markdown(jpg,
42-
#' llm_client = llm_client,
43-
#' llm_model = "gpt-4o"
44-
#' )
45+
#' read_as_markdown(jpg, llm_client = llm_client, llm_model = "gpt-4.1-mini")
4546
#' # # Description:
46-
#' # The image features the official logo of the R programming language.
47-
#' # Prominently displayed is a bold, blue letter "R," which serves as the
48-
#' # centerpiece of the design. Encircling the "R" is a two-toned,
49-
#' # stylized oval or ellipse with a gradient that transitions from dark
50-
#' # gray to light gray, creating a sense of motion and dynamics. R is an
51-
#' # open-source programming language widely used for statistical
52-
#' # computing, data analysis, and graphical representation. The logo
53-
#' # represents the language's focus on clarity, precision, and
54-
#' # versatility in handling complex data tasks.
47+
#' # The image displays the logo of the R programming language. It features a
48+
#' # large, stylized capital letter "R" in blue, positioned prominently in the
49+
#' # center. Surrounding the "R" is a gray oval shape that is open on the right
50+
#' # side, creating a dynamic and modern appearance. The R logo is commonly
51+
#' # associated with statistical computing, data analysis, and graphical
52+
#' # representation in various scientific and professional fields.
5553
#' }
5654
#'
5755
#' # Alternative approach to image conversion:
58-
#' if(FALSE) {
59-
#' if (Sys.getenv("OPENAI_API_KEY") != "") {
60-
#' rlang::check_installed(c("ellmer", "magick"))
61-
#' chat <- ellmer::chat_openai(echo = TRUE)
62-
#' chat$chat("Describe this image", ellmer::content_image_file(jpg))
63-
#' }
56+
#' if (
57+
#' Sys.getenv("OPENAI_API_KEY") != "" &&
58+
#' rlang::is_installed("ellmer") &&
59+
#' rlang::is_installed("magick")
60+
#' ) {
61+
#' chat <- ellmer::chat_openai(echo = TRUE)
62+
#' chat$chat("Describe this image", ellmer::content_image_file(jpg))
6463
#' }
6564
read_as_markdown <- function(x, ..., canonical = FALSE) {
6665
check_string(x)
@@ -135,9 +134,12 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
135134
tag = elements |> xml_name(),
136135
source_position = elements |> xml_attr("data-sourcepos")
137136
)
138-
if (length(tags)) df <- df[df$tag %in% unique(c(tags)), ]
139137

140-
# common mark returns positions as line:byte-line:byte
138+
if (length(tags)) {
139+
df <- df[df$tag %in% unique(c(tags)), ]
140+
}
141+
142+
# commonmark returns positions as line:byte-line:byte
141143
# e.g., 52:1-52:20
142144
position <- df$source_position |>
143145
stri_split_charclass("[-:]", n = 4L, simplify = TRUE)
@@ -147,13 +149,15 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
147149
line_numbytes <- stri_numbytes(lines) + 1L # +1 for \n
148150
line_startbyte <- c(1L, 1L + drop_last(cumsum(line_numbytes)))
149151

150-
start <- line_startbyte[position[, "start_line"]] +
151-
position[, "start_byte"] -
152-
1L
153-
end <- line_startbyte[position[, "end_line"]] + position[, "end_byte"] - 1L
152+
start <-
153+
line_startbyte[position[, "start_line"]] + position[, "start_byte"] - 1L
154+
end <-
155+
line_startbyte[position[, "end_line"]] + position[, "end_byte"] - 1L
154156

155157
## To convert byte to char index:
156-
# char_byte_indexes <- stri_split_boundaries(text, type = "character")[[1L]] |> stri_numbytes() |> cumsum()
158+
# char_byte_indexes <-
159+
# stri_split_boundaries(text, type = "character")[[1L]] |>
160+
# stri_numbytes() |> cumsum()
157161
# start <- match(start, char_byte_indexes)
158162
# end <- match(end, char_byte_indexes)
159163
tibble::tibble(tag = df$tag, start = start, end = end)
@@ -163,11 +167,13 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
163167
#' Segment markdown text
164168
#'
165169
#' @param text Markdown string
166-
#' @param tags,segment_by A character vector of html tag names, e.g., `c("h1", "h2", "h3", "pre")`
170+
#' @param tags,segment_by A character vector of html tag names, e.g.,
171+
#' `c("h1", "h2", "h3", "pre")`
167172
#' @param trim logical, trim whitespace on segments
168173
#' @param omit_empty logical, whether to remove empty segments
169174
#'
170-
#' @returns A named character vector. Names will correspond to `tags`, or `""` for content inbetween tags.
175+
#' @returns A named character vector. Names will correspond to `tags`, or `""`
176+
#' for content in between tags.
171177
#' @export
172178
#'
173179
#' @examples
@@ -200,9 +206,9 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
200206
#' A table <table>:
201207
#'
202208
#' | Name | Age | City |
203-
#' |-------|----:|----------|
204-
#' | Alice | 25 | New York |
205-
#' | Bob | 30 | London |
209+
#' |-------|----:|-----------|
210+
#' | Alice | 25 | New York |
211+
#' | Bob | 30 | London |
206212
#'
207213
#'
208214
#' ## Conclusion
@@ -211,7 +217,8 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
211217
#'
212218
#' - h1, h2, h3, h4, h5, h6: section headings
213219
#' - p: paragraph (prose)
214-
#' - pre: pre-formatted text, meant to be displayed with monospace font. Typically code or code output
220+
#' - pre: pre-formatted text, meant to be displayed with monospace font.
221+
#' Typically code or code output
215222
#' - blockquote: A blockquote
216223
#' - table: A table
217224
#' - ul: Unordered list
@@ -339,7 +346,7 @@ markdown_segment_text <- function(
339346
#'
340347
#' @export
341348
#'
342-
#' @examples
349+
#' @examplesIf reticulate::py_available()
343350
#' file <- tempfile(fileext = ".html")
344351
#' download.file("https://r4ds.hadley.nz/base-R.html", file, quiet = TRUE)
345352
#'
@@ -462,3 +469,10 @@ cli_markitdown <- function(args, ...) {
462469
...
463470
)
464471
}
472+
473+
474+
should_init_python <- function() {
475+
reticulate::py_available() ||
476+
interactive() ||
477+
identical(Sys.getenv("IN_PKGDOWN"), "true")
478+
}

R/retrieve.R

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -266,35 +266,46 @@ ragnar_retrieve_vss_and_bm25 <- function(store, text, top_k = 3, ...) {
266266
#'
267267
#' @family ragnar_retrieve
268268
#' @export
269-
#' @examples
269+
#' @examplesIf (rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")))
270270
#' # Basic usage
271-
#' mock_embed <- function(x) matrix(stats::runif(10), nrow = length(x), ncol = 10)
272-
#' store <- ragnar_store_create(embed = mock_embed)
271+
#' store <- ragnar_store_create(
272+
#' embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small")
273+
#' )
273274
#' ragnar_store_insert(store, data.frame(text = c("foo", "bar")))
274275
#' ragnar_store_build_index(store)
275276
#' ragnar_retrieve(store, "foo")
276277
#'
277278
#' # More Advanced: store metadata, retrieve with pre-filtering
278279
#' store <- ragnar_store_create(
279-
#' embed = mock_embed,
280+
#' embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"),
280281
#' extra_cols = data.frame(category = character())
281282
#' )
283+
#'
284+
#' ragnar_store_insert(
285+
#' store,
286+
#' data.frame(
287+
#' category = "dessert",
288+
#' text = c("ice cream", "cake", "cookies")
289+
#' )
290+
#' )
291+
#'
282292
#' ragnar_store_insert(
283293
#' store,
284294
#' data.frame(
285-
#' category = c("desert", "desert", "desert", "meal", "meal", "meal"),
286-
#' text = c("ice cream", "cake", "cookies", "pasta", "burger", "salad")
295+
#' category = "meal",
296+
#' text = c("steak", "potatoes", "salad")
287297
#' )
288298
#' )
299+
#'
289300
#' ragnar_store_build_index(store)
290301
#'
291302
#' # simple retrieve
292-
#' ragnar_retrieve(store, "yummy")
303+
#' ragnar_retrieve(store, "carbs")
293304
#'
294305
#' # retrieve with pre-filtering
295306
#' dplyr::tbl(store) |>
296307
#' dplyr::filter(category == "meal") |>
297-
#' ragnar_retrieve("yummy")
308+
#' ragnar_retrieve("carbs")
298309
ragnar_retrieve <- function(store, text, top_k = 3L) {
299310
ragnar_retrieve_vss_and_bm25(store, text, top_k)
300311
}

R/store.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ ragnar_store_build_index <- function(store, type = c("vss", "fts")) {
469469
if ("vss" %in% type && !is.null(store@embed)) {
470470
# TODO: duckdb has support for three different distance metrics that can be
471471
# selected when building the index: l2sq, cosine, and ip. Expose these as options
472-
# in the R interface. https://duckdb.org/docs/extensions/vss.html#usage
472+
# in the R interface. https://duckdb.org/docs/stable/core_extensions/vss#usage
473473
dbExecute(con, "INSTALL vss;")
474474
dbExecute(con, "LOAD vss;")
475475
dbExecute(
@@ -486,7 +486,7 @@ ragnar_store_build_index <- function(store, type = c("vss", "fts")) {
486486
dbExecute(con, "INSTALL fts;")
487487
dbExecute(con, "LOAD fts;")
488488
# fts index builder takes many options, e.g., stemmer, stopwords, etc.
489-
# Expose a way to pass along args. https://duckdb.org/docs/extensions/full_text_search.html
489+
# Expose a way to pass along args. https://duckdb.org/docs/stable/core_extensions/full_text_search
490490
dbExecute(
491491
con,
492492
"PRAGMA create_fts_index('chunks', 'id', 'text', overwrite = 1);"

R/utils.R

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -139,19 +139,5 @@ reorder_names <- function(..., last = NULL) {
139139
x
140140
}
141141

142+
is_windows <- function() identical(.Platform$OS.type, "windows")
142143

143-
ollama_ls <- function() {
144-
rlang::check_installed("readr")
145-
tbl <- system2("ollama", "list", stdout = TRUE)
146-
header <- tbl[1]
147-
col_starts <- stringi::stri_locate_all_words(header)[[1]][, "start"]
148-
col_positions <- readr::fwf_positions(
149-
start = col_starts,
150-
end = c(col_starts[-1L] - 1L, NA),
151-
col_names = stringi::stri_extract_all_words(header)[[1]] |>
152-
stringi::stri_trans_tolower()
153-
)
154-
col_types <- "cccc"
155-
156-
readr::read_fwf(I(tbl[-1]), col_positions, col_types)
157-
}

0 commit comments

Comments
 (0)