@@ -9,58 +9,57 @@ init_markitdown <- function(...) {
9
9
10
10
# ' Convert files to markdown
11
11
# '
12
- # ' @param x A filepath or url
13
- # ' @inheritParams rlang::args_dots_empty
14
- # ' @param ... Passed on to `MarkItDown.convert()`
12
+ # ' @param x A filepath or url. Accepts a wide variety of file types, including
13
+ # ' PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF
14
+ # ' metadata and speech transcription), HTML, Text-based formats (CSV, JSON, XML),
15
+ # ' ZIP files (iterates over contents), YouTube URLs, and EPUBs.
16
+ # ' @param ... Passed on to `MarkItDown.convert()`
15
17
# ' @param canonical logical, whether to postprocess the output from MarkItDown
16
18
# ' with `commonmark::markdown_commonmark()`.
17
19
# '
18
20
# ' @returns A single string of markdown
19
21
# ' @export
20
22
# '
21
- # ' @examples
23
+ # ' @examplesIf reticulate::py_available()
22
24
# ' # convert html
23
25
# ' read_as_markdown("https://r4ds.hadley.nz/base-R.html") |>
24
- # ' substr(1, 1000) |> cat()
26
+ # ' substr(1, 1000) |>
27
+ # ' cat()
25
28
# '
26
29
# ' read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |>
27
- # ' substr(1, 1000) |> cat()
30
+ # ' substr(1, 1000) |>
31
+ # ' cat()
28
32
# '
29
33
# ' # convert pdf
30
34
# ' pdf <- file.path(R.home("doc"), "NEWS.pdf")
31
35
# ' read_as_markdown(pdf) |> substr(1, 1000) |> cat()
32
36
# ' ## alternative:
33
37
# ' # pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat()
34
38
# '
35
- # ' # convert images
39
+ # ' # convert images to markdown descriptions using OpenAI
36
40
# ' jpg <- file.path(R.home("doc"), "html", "logo.jpg")
37
- # ' if (FALSE ) {
38
- # ' # system("brew install ffmpeg")
41
+ # ' if (Sys.getenv("OPENAI_API_KEY") != "" ) {
42
+ # ' # if (xfun::is_macos()) system("brew install ffmpeg")
39
43
# ' reticulate::py_require("openai")
40
44
# ' llm_client <- reticulate::import("openai")$OpenAI()
41
- # ' read_as_markdown(jpg,
42
- # ' llm_client = llm_client,
43
- # ' llm_model = "gpt-4o"
44
- # ' )
45
+ # ' read_as_markdown(jpg, llm_client = llm_client, llm_model = "gpt-4.1-mini")
45
46
# ' # # Description:
46
- # ' # The image features the official logo of the R programming language.
47
- # ' # Prominently displayed is a bold, blue letter "R," which serves as the
48
- # ' # centerpiece of the design. Encircling the "R" is a two-toned,
49
- # ' # stylized oval or ellipse with a gradient that transitions from dark
50
- # ' # gray to light gray, creating a sense of motion and dynamics. R is an
51
- # ' # open-source programming language widely used for statistical
52
- # ' # computing, data analysis, and graphical representation. The logo
53
- # ' # represents the language's focus on clarity, precision, and
54
- # ' # versatility in handling complex data tasks.
47
+ # ' # The image displays the logo of the R programming language. It features a
48
+ # ' # large, stylized capital letter "R" in blue, positioned prominently in the
49
+ # ' # center. Surrounding the "R" is a gray oval shape that is open on the right
50
+ # ' # side, creating a dynamic and modern appearance. The R logo is commonly
51
+ # ' # associated with statistical computing, data analysis, and graphical
52
+ # ' # representation in various scientific and professional fields.
55
53
# ' }
56
54
# '
57
55
# ' # Alternative approach to image conversion:
58
- # ' if(FALSE) {
59
- # ' if (Sys.getenv("OPENAI_API_KEY") != "") {
60
- # ' rlang::check_installed(c("ellmer", "magick"))
61
- # ' chat <- ellmer::chat_openai(echo = TRUE)
62
- # ' chat$chat("Describe this image", ellmer::content_image_file(jpg))
63
- # ' }
56
+ # ' if (
57
+ # ' Sys.getenv("OPENAI_API_KEY") != "" &&
58
+ # ' rlang::is_installed("ellmer") &&
59
+ # ' rlang::is_installed("magick")
60
+ # ' ) {
61
+ # ' chat <- ellmer::chat_openai(echo = TRUE)
62
+ # ' chat$chat("Describe this image", ellmer::content_image_file(jpg))
64
63
# ' }
65
64
read_as_markdown <- function (x , ... , canonical = FALSE ) {
66
65
check_string(x )
@@ -135,9 +134,12 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
135
134
tag = elements | > xml_name(),
136
135
source_position = elements | > xml_attr(" data-sourcepos" )
137
136
)
138
- if (length(tags )) df <- df [df $ tag %in% unique(c(tags )), ]
139
137
140
- # common mark returns positions as line:byte-line:byte
138
+ if (length(tags )) {
139
+ df <- df [df $ tag %in% unique(c(tags )), ]
140
+ }
141
+
142
+ # commonmark returns positions as line:byte-line:byte
141
143
# e.g., 52:1-52:20
142
144
position <- df $ source_position | >
143
145
stri_split_charclass(" [-:]" , n = 4L , simplify = TRUE )
@@ -147,13 +149,15 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
147
149
line_numbytes <- stri_numbytes(lines ) + 1L # +1 for \n
148
150
line_startbyte <- c(1L , 1L + drop_last(cumsum(line_numbytes )))
149
151
150
- start <- line_startbyte [ position [, " start_line " ]] +
151
- position [, " start_byte" ] -
152
- 1L
153
- end <- line_startbyte [position [, " end_line" ]] + position [, " end_byte" ] - 1L
152
+ start <-
153
+ line_startbyte [ position [, " start_line " ]] + position [, " start_byte" ] - 1L
154
+ end <-
155
+ line_startbyte [position [, " end_line" ]] + position [, " end_byte" ] - 1L
154
156
155
157
# # To convert byte to char index:
156
- # char_byte_indexes <- stri_split_boundaries(text, type = "character")[[1L]] |> stri_numbytes() |> cumsum()
158
+ # char_byte_indexes <-
159
+ # stri_split_boundaries(text, type = "character")[[1L]] |>
160
+ # stri_numbytes() |> cumsum()
157
161
# start <- match(start, char_byte_indexes)
158
162
# end <- match(end, char_byte_indexes)
159
163
tibble :: tibble(tag = df $ tag , start = start , end = end )
@@ -163,11 +167,13 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
163
167
# ' Segment markdown text
164
168
# '
165
169
# ' @param text Markdown string
166
- # ' @param tags,segment_by A character vector of html tag names, e.g., `c("h1", "h2", "h3", "pre")`
170
+ # ' @param tags,segment_by A character vector of html tag names, e.g.,
171
+ # ' `c("h1", "h2", "h3", "pre")`
167
172
# ' @param trim logical, trim whitespace on segments
168
173
# ' @param omit_empty logical, whether to remove empty segments
169
174
# '
170
- # ' @returns A named character vector. Names will correspond to `tags`, or `""` for content inbetween tags.
175
+ # ' @returns A named character vector. Names will correspond to `tags`, or `""`
176
+ # ' for content in between tags.
171
177
# ' @export
172
178
# '
173
179
# ' @examples
@@ -200,9 +206,9 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
200
206
# ' A table <table>:
201
207
# '
202
208
# ' | Name | Age | City |
203
- # ' |-------|----:|----------|
204
- # ' | Alice | 25 | New York |
205
- # ' | Bob | 30 | London |
209
+ # ' |-------|----:|----------- |
210
+ # ' | Alice | 25 | New York |
211
+ # ' | Bob | 30 | London |
206
212
# '
207
213
# '
208
214
# ' ## Conclusion
@@ -211,7 +217,8 @@ markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
211
217
# '
212
218
# ' - h1, h2, h3, h4, h5, h6: section headings
213
219
# ' - p: paragraph (prose)
214
- # ' - pre: pre-formatted text, meant to be displayed with monospace font. Typically code or code output
220
+ # ' - pre: pre-formatted text, meant to be displayed with monospace font.
221
+ # ' Typically code or code output
215
222
# ' - blockquote: A blockquote
216
223
# ' - table: A table
217
224
# ' - ul: Unordered list
@@ -339,7 +346,7 @@ markdown_segment_text <- function(
339
346
# '
340
347
# ' @export
341
348
# '
342
- # ' @examples
349
+ # ' @examplesIf reticulate::py_available()
343
350
# ' file <- tempfile(fileext = ".html")
344
351
# ' download.file("https://r4ds.hadley.nz/base-R.html", file, quiet = TRUE)
345
352
# '
@@ -462,3 +469,10 @@ cli_markitdown <- function(args, ...) {
462
469
...
463
470
)
464
471
}
472
+
473
+
474
# Decide whether Python may be initialized in the current session.
#
# Takes no arguments and returns a single `TRUE`/`FALSE`. The result is `TRUE`
# as soon as one of the following holds (evaluated in this order, with `||`
# short-circuiting on the first hit):
#   * `reticulate::py_available()` returns `TRUE`,
#   * the R session is interactive,
#   * the `IN_PKGDOWN` environment variable is exactly "true"
#     (pkgdown sets this while it builds a site).
#
# NOTE(review): presumably used to gate Python start-up in examples/checks;
# confirm against the callers, which are not visible here.
should_init_python <- function() {
  # identical() rather than `==`: an unset variable ("") or any other value
  # yields a plain FALSE. Neither the variable name nor the expected value may
  # carry surrounding whitespace, otherwise this branch can never match.
  reticulate::py_available() ||
    interactive() ||
    identical(Sys.getenv("IN_PKGDOWN"), "true")
}
0 commit comments