Skip to content

Commit 02a2f92

Browse files
committed
Split ?read_file_duckdb
1 parent 8877d0b commit 02a2f92

14 files changed

+355
-247
lines changed

NEWS.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
## Documentation
1818

19-
- Separate `?compute_parquet` and `?compute_csv` (#610, #622).
19+
- Separate `?compute_parquet` and `?compute_csv` (#610, #622), and split `?read_file_duckdb`.
2020

2121
- Italicize book title in README (@wibeasley, #607).
2222

R/io2.R

-158
This file was deleted.

R/read_csv_duckdb.R

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#' Read CSV files using DuckDB
2+
#'
3+
#' @description
4+
#' `read_csv_duckdb()` reads a CSV file using DuckDB's `read_csv_auto()` table function.
5+
#'
6+
#' @inheritParams read_file_duckdb
7+
#' @param options Arguments to the DuckDB `read_csv_auto` table function.
8+
#'
9+
#' @seealso [read_parquet_duckdb()], [read_json_duckdb()]
10+
#'
11+
#' @export
12+
#' @examples
13+
#' # Create simple CSV file
14+
#' path <- tempfile("duckplyr_test_", fileext = ".csv")
15+
#' write.csv(data.frame(a = 1:3, b = letters[4:6]), path, row.names = FALSE)
16+
#'
17+
#' # Reading is immediate
18+
#' df <- read_csv_duckdb(path)
19+
#'
20+
#' # Names are always available
21+
#' names(df)
22+
#'
23+
#' # Materialization upon access is turned off by default
24+
#' try(print(df$a))
25+
#'
26+
#' # Materialize explicitly
27+
#' collect(df)$a
28+
#'
29+
#' # Automatic materialization with prudence = "lavish"
30+
#' df <- read_csv_duckdb(path, prudence = "lavish")
31+
#' df$a
32+
#'
33+
#' # Specify column types
34+
#' read_csv_duckdb(
35+
#' path,
36+
#' options = list(delim = ",", types = list(c("DOUBLE", "VARCHAR")))
37+
#' )
38+
read_csv_duckdb <- function(path, ..., prudence = c("thrifty", "lavish", "stingy"), options = list()) {
39+
check_dots_empty() # presumably rlang's check: errors if anything was passed through `...`
40+
41+
read_file_duckdb(path, "read_csv_auto", prudence = prudence, options = options) # delegate to the generic reader, fixing the table function to "read_csv_auto"
42+
}

R/read_file_duckdb.R

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#' Read files using DuckDB
2+
#'
3+
#' @description
4+
#' `read_file_duckdb()` uses arbitrary readers to read data.
5+
#' See <https://duckdb.org/docs/data/overview> for documentation
6+
#' of the available functions and their options.
7+
#' To read multiple files with the same schema,
8+
#' pass a wildcard or a character vector to the `path` argument.
9+
#'
10+
#' @inheritParams rlang::args_dots_empty
11+
#'
12+
#' @param path Path to files; glob patterns `*` and `?` are supported.
13+
#' @param table_function The name of a table-valued
14+
#' DuckDB function such as `"read_parquet"`,
15+
#' `"read_csv"`, `"read_csv_auto"` or `"read_json"`.
16+
#' @param prudence Memory protection, controls if DuckDB may convert
17+
#' intermediate results in DuckDB-managed memory to data frames in R memory.
18+
#'
19+
#' - `"thrifty"`: up to a maximum size of 1 million cells,
20+
#' - `"lavish"`: regardless of size,
21+
#' - `"stingy"`: never.
22+
#'
23+
#' The default is `"thrifty"` for the ingestion functions,
24+
#' and may be different for other functions.
25+
#' See `vignette("prudence")` for more information.
26+
#'
27+
#' @param options Arguments to the DuckDB function
28+
#' indicated by `table_function`.
29+
#'
30+
#' @inheritSection duckdb_tibble Fine-tuning prudence
31+
#'
32+
#' @return A duckplyr frame, see [as_duckdb_tibble()] for details.
33+
#'
34+
#' @seealso [read_csv_duckdb()], [read_parquet_duckdb()], [read_json_duckdb()]
35+
#'
36+
#' @rdname read_file_duckdb
37+
#' @export
38+
read_file_duckdb <- function(
39+
path,
40+
table_function,
41+
...,
42+
prudence = c("thrifty", "lavish", "stingy"),
43+
options = list()
44+
) {
45+
check_dots_empty() # presumably rlang's check: errors if anything was passed through `...`
46+
47+
if (!rlang::is_character(path)) { # only the type is checked here; length and NA values are not validated
48+
cli::cli_abort("{.arg path} must be a character vector.")
49+
}
50+
51+
if (length(path) != 1) { # several paths (and also a zero-length vector) take this branch
52+
path <- list(path) # wrap so the whole vector travels as one element of the argument list below
53+
}
54+
55+
duckfun(table_function, c(list(path), options), prudence = prudence) # argument list: path first, then the caller's options; `prudence` is forwarded unchanged
56+
}
57+
58+
duckfun <- function(table_function, args, ..., prudence) { # internal helper: builds a relation from a DuckDB table function and returns rel_to_df()'s result; `...` is not used in this body
59+
if (!is.list(args)) {
60+
cli::cli_abort("{.arg args} must be a list.")
61+
}
62+
if (length(args) == 0) { # args[[1]] is read below, so at least one element is required
63+
cli::cli_abort("{.arg args} must not be empty.")
64+
}
65+
66+
# FIXME: For some reason, it's important to create an alias here
67+
con <- get_default_duckdb_connection()
68+
69+
# FIXME: Provide better duckdb API
70+
path <- args[[1]] # first element is the path, as assembled by read_file_duckdb()
71+
options <- args[-1] # everything after the first element is handed on as options
72+
73+
rel <- duckdb$rel_from_table_function(
74+
con,
75+
table_function,
76+
list(path), # NOTE(review): presumably the positional arguments of the table function - confirm against the duckdb API
77+
options # NOTE(review): presumably the named arguments of the table function - confirm against the duckdb API
78+
)
79+
80+
meta_rel_register_file(rel, table_function, path, options) # NOTE(review): defined elsewhere; presumably records how `rel` was created - confirm
81+
82+
rel_to_df(rel, prudence = prudence) # last expression, so this is the function's return value
83+
}

R/read_json_duckdb.R

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#' Read JSON files using DuckDB
2+
#'
3+
#' @description
4+
#' `read_json_duckdb()` reads a JSON file using DuckDB's `read_json()` table function.
5+
#'
6+
#' @inheritParams read_file_duckdb
7+
#' @param options Arguments to the DuckDB `read_json` table function.
8+
#'
9+
#' @seealso [read_csv_duckdb()], [read_parquet_duckdb()]
10+
#'
11+
#' @export
12+
#' @examplesIf identical(Sys.getenv("IN_PKGDOWN"), "TRUE")
13+
#'
14+
#' # Create and read a simple JSON file
15+
#' path <- tempfile("duckplyr_test_", fileext = ".json")
16+
#' writeLines('[{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]', path)
17+
#'
18+
#' # Reading needs the json extension
19+
#' db_exec("INSTALL json")
20+
#' db_exec("LOAD json")
21+
#' read_json_duckdb(path)
22+
read_json_duckdb <- function(path, ..., prudence = c("thrifty", "lavish", "stingy"), options = list()) {
23+
check_dots_empty() # presumably rlang's check: errors if anything was passed through `...`
24+
25+
read_file_duckdb(path, "read_json", prudence = prudence, options = options) # delegate to the generic reader, fixing the table function to "read_json"
26+
}

R/read_parquet_duckdb.R

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#' Read Parquet files using DuckDB
2+
#'
3+
#' @description
4+
#' `read_parquet_duckdb()` reads a Parquet file using DuckDB's `read_parquet()` table function.
5+
#'
6+
#' @inheritParams read_file_duckdb
7+
#' @param options Arguments to the DuckDB `read_parquet` table function.
8+
#'
9+
#' @seealso [read_csv_duckdb()], [read_json_duckdb()]
10+
#'
11+
#' @export
12+
read_parquet_duckdb <- function(path, ..., prudence = c("thrifty", "lavish", "stingy"), options = list()) {
13+
check_dots_empty() # presumably rlang's check: errors if anything was passed through `...`
14+
15+
read_file_duckdb(path, "read_parquet", prudence = prudence, options = options) # delegate to the generic reader, fixing the table function to "read_parquet"
16+
}

_pkgdown.yml

+3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ reference:
2727
- title: Using duckplyr
2828
contents:
2929
- duckdb_tibble
30+
- read_parquet_duckdb
31+
- read_csv_duckdb
32+
- read_json_duckdb
3033
- read_file_duckdb
3134
- read_sql_duckdb
3235

0 commit comments

Comments
 (0)