Skip to content

Commit f9dfd77

Browse files
authored
Add additional extension URI to Arrow Substrait Compiler (#208)
1 parent 2bff904 commit f9dfd77

14 files changed

+160
-50
lines changed

.github/workflows/R-CMD-check.yaml

+40-32
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ jobs:
4040

4141
steps:
4242
- uses: actions/checkout@v3
43+
with:
44+
path: substrait
4345

4446
- uses: r-lib/actions/setup-pandoc@v2
4547

@@ -56,39 +58,45 @@ jobs:
5658
uses: r-lib/actions/setup-r-dependencies@v2
5759
with:
5860
extra-packages: rcmdcheck
61+
working-directory: /home/runner/work/substrait-r/substrait-r/substrait/
62+
63+
- name: Install DuckDB
64+
run: |
65+
R -e "remotes::install_github('cran/[email protected]')"
66+
67+
- name: Checkout Arrow repo
68+
uses: actions/checkout@v3
69+
with:
70+
repository: apache/arrow
71+
path: arrow
72+
73+
- name: Install Arrow with ARROW_SUBSTRAIT turned on
74+
run: |
75+
mkdir install_dir
76+
cd /home/runner/work/substrait-r/substrait-r/arrow/cpp
77+
mkdir build_dir
78+
cd build_dir
79+
80+
cmake -DCMAKE_INSTALL_PREFIX=${ARROW_HOME} \
81+
-DCMAKE_INSTALL_LIBDIR=lib -DCMAKE_BUILD_TYPE=Debug -DARROW_COMPUTE=ON -DARROW_CSV=ON -DARROW_DATASET=OFF \
82+
-DARROW_FILESYSTEM=ON -DARROW_JEMALLOC=OFF -DARROW_JSON=OFF -DARROW_PARQUET=ON -DARROW_WITH_SNAPPY=OFF \
83+
-DARROW_WITH_ZLIB=OFF -DARROW_INSTALL_NAME_RPATH=OFF -DARROW_EXTRA_ERROR_CONTEXT=ON \
84+
-DARROW_INSTALL_NAME_RPATH=OFF -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_SUBSTRAIT=ON ..
5985
60-
# Not using arrow package until https://github.com/apache/arrow/pull/13914 merges
61-
# - name: Checkout Arrow repo
62-
# uses: actions/checkout@v3
63-
# with:
64-
# repository: apache/arrow
65-
# path: arrow
66-
#
67-
# - name: Install Arrow with ARROW_SUBSTRAIT turned on
68-
# run: |
69-
# mkdir install_dir
70-
# cd /home/runner/work/substrait-r/substrait-r/arrow/cpp
71-
# mkdir build_dir
72-
# cd build_dir
73-
#
74-
# cmake -DCMAKE_INSTALL_PREFIX=${ARROW_HOME} \
75-
# -DCMAKE_INSTALL_LIBDIR=lib -DCMAKE_BUILD_TYPE=Debug -DARROW_COMPUTE=ON -DARROW_CSV=ON -DARROW_DATASET=OFF \
76-
# -DARROW_FILESYSTEM=ON -DARROW_JEMALLOC=OFF -DARROW_JSON=OFF -DARROW_PARQUET=ON -DARROW_WITH_SNAPPY=OFF \
77-
# -DARROW_WITH_ZLIB=OFF -DARROW_INSTALL_NAME_RPATH=OFF -DARROW_EXTRA_ERROR_CONTEXT=ON \
78-
# -DARROW_INSTALL_NAME_RPATH=OFF -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_SUBSTRAIT=ON ..
79-
#
80-
# sudo make -j2 install
81-
#
82-
# - name: Setup arrow dependencies
83-
# uses: r-lib/actions/setup-r-dependencies@v2
84-
# with:
85-
# working-directory: /home/runner/work/substrait-r/substrait-r/arrow/r/
86-
#
87-
# - name: Install Arrow R package
88-
# run: |
89-
# cd /home/runner/work/substrait-r/substrait-r/arrow/r/
90-
# make clean
91-
# R CMD INSTALL .
86+
sudo make -j2 install
87+
88+
- name: Setup arrow dependencies
89+
uses: r-lib/actions/setup-r-dependencies@v2
90+
with:
91+
working-directory: /home/runner/work/substrait-r/substrait-r/arrow/r/
92+
93+
- name: Install Arrow R package
94+
run: |
95+
cd /home/runner/work/substrait-r/substrait-r/arrow/r/
96+
make clean
97+
R CMD INSTALL .
9298
9399
- name: Run R CMD check
94100
uses: r-lib/actions/check-r-package@v2
101+
with:
102+
working-directory: /home/runner/work/substrait-r/substrait-r/substrait/

R/compiler.R

+14-6
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ SubstraitCompiler <- R6::R6Class(
6262
initialize = function(object = NULL, ...) {
6363
self$.fns <- list()
6464

65-
private$extension_uri <- substrait$extensions$SimpleExtensionURI$create(
66-
extension_uri_anchor = 1L
65+
private$extension_uri <- list(
66+
substrait$extensions$SimpleExtensionURI$create(extension_uri_anchor = 1L)
6767
)
6868

6969
# these are key/value stores but for at least function_extensions
@@ -189,9 +189,7 @@ SubstraitCompiler <- R6::R6Class(
189189
)
190190
)
191191
),
192-
extension_uris = list(
193-
private$extension_uri
194-
),
192+
extension_uris = private$extension_uri,
195193
extensions = c(
196194
lapply(
197195
unname(private$function_extensions),
@@ -271,7 +269,7 @@ SubstraitCompiler <- R6::R6Class(
271269
extensions$
272270
SimpleExtensionDeclaration$
273271
ExtensionFunction$create(
274-
extension_uri_reference = private$extension_uri$extension_uri_anchor,
272+
extension_uri_reference = self$extension_uri_anchor(name),
275273
function_anchor = self$next_id(),
276274
name = name
277275
)
@@ -283,6 +281,16 @@ SubstraitCompiler <- R6::R6Class(
283281
extension_function$function_anchor
284282
},
285283

284+
#' @description
285+
#' Get the extension uri anchor value for a given function
286+
#'
287+
#' @param name The name of the function
288+
#'
289+
#' @return The uri anchor value
290+
extension_uri_anchor = function(name) {
291+
private$extension_uri[[1]]$extension_uri_anchor
292+
},
293+
286294
#' @description
287295
#' Retrieve a function extension by anchor/reference
288296
#'

R/pkg-arrow.R

+40-8
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,48 @@
22
ArrowSubstraitCompiler <- R6::R6Class(
33
"ArrowSubstraitCompiler",
44
inherit = SubstraitCompiler,
5+
private = list(extension_uri = NULL),
56
public = list(
67
initialize = function(...) {
78
super$initialize(...)
8-
self$.fns = arrow_funs
9+
self$.fns <- arrow_funs
10+
private$extension_uri <- list(
11+
"arithmetic" = substrait$extensions$SimpleExtensionURI$create(
12+
extension_uri_anchor = 1L,
13+
uri = "https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml"
14+
),
15+
"comparison" = substrait$extensions$SimpleExtensionURI$create(
16+
extension_uri_anchor = 2L,
17+
uri = "https://github.com/substrait-io/substrait/blob/main/extensions/functions_comparison.yaml"
18+
)
19+
)
20+
},
21+
extension_uri_anchor = function(name) {
22+
prefix <- strsplit(name, ".", fixed = TRUE)[[1]][1]
23+
private$extension_uri[[prefix]]$extension_uri_anchor
924
},
1025
evaluate = function(...) {
1126
plan <- self$plan()
1227

13-
# Here we only implement 'add()', so this works because the only
14-
# function that we ever use is contained in this extensions definition.
15-
plan$extension_uris[[1]]$uri <-
16-
"https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml"
17-
1828
substrait_eval_arrow(
1929
plan = plan,
2030
tables = self$named_table_list(),
2131
col_names = self$schema$names
2232
)
33+
},
34+
plan = function() {
35+
plan <- super$plan()
36+
37+
for (i in seq_along(plan$extensions)) {
38+
if (is.null(plan$extensions[[i]]$extension_function)) {
39+
next
40+
}
41+
42+
short_name <- strsplit(plan$extensions[[i]]$extension_function$name, ".", fixed = TRUE)[[1]][2]
43+
plan$extensions[[i]]$extension_function$name <- short_name
44+
}
45+
46+
plan
2347
}
2448
)
2549
)
@@ -29,7 +53,7 @@ arrow_funs <- new.env(parent = emptyenv())
2953

3054
arrow_funs[["+"]] <- function(lhs, rhs) {
3155
substrait_call(
32-
"add",
56+
"arithmetic.add",
3357
substrait$FunctionArgument$create(
3458
enum_ = substrait$FunctionArgument$Enum$create(unspecified = substrait_proto_auto())
3559
),
@@ -39,6 +63,15 @@ arrow_funs[["+"]] <- function(lhs, rhs) {
3963
)
4064
}
4165

66+
arrow_funs[[">"]] <- function(lhs, rhs) {
67+
substrait_call(
68+
"comparison.gt",
69+
lhs,
70+
rhs,
71+
.output_type = substrait_boolean()
72+
)
73+
}
74+
4275
#' Create an Arrow Substrait Compiler
4376
#'
4477
#' @param object A [data.frame()], [arrow::Table], [arrow::RecordBatch],
@@ -198,7 +231,6 @@ from_substrait.RecordBatch <- function(msg, x, ...) {
198231

199232
substrait_eval_arrow <- function(plan, tables, col_names) {
200233
stopifnot(has_arrow_with_substrait())
201-
202234
plan <- as_substrait(plan, "substrait.Plan")
203235
stopifnot(rlang::is_named2(tables))
204236

R/pkg-dplyr.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#' @importFrom dplyr select
2525
#' @export
2626
#'
27-
#' @examples
27+
#' @examplesIf has_duckdb_with_substrait()
2828
#' library(dplyr)
2929
#' compiler <- duckdb_substrait_compiler(mtcars)
3030
#'

man/SubstraitCompiler.Rd

+21
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/select.SubstraitCompiler.Rd

+2-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-dplyr-arrange.R

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
library(dplyr, warn.conflicts = FALSE)
22
skip_if_not(has_arrow_with_substrait())
3+
skip_if_not(has_duckdb_with_substrait())
34

45

56
# randomize order of rows in test data

tests/testthat/test-dplyr-filter.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
library(dplyr, warn.conflicts = FALSE)
22
library(stringr)
33
skip_if_not(has_arrow_with_substrait())
4+
skip_if_not(has_duckdb_with_substrait())
45

56
test_that("filter() on is.na()", {
67

@@ -50,8 +51,6 @@ test_that("filtering with expression", {
5051

5152
test_that("filtering with arithmetic", {
5253
compare_dplyr_binding(
53-
# skip("arithmetic functions not yet implemented: https://github.com/voltrondata/substrait-r/issues/20")
54-
engine = "duckdb",
5554
.input %>%
5655
filter(some_negative + 1 > 3) %>%
5756
select(string = chr, int, dbl) %>%

tests/testthat/test-dplyr-group_by.R

+4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
library(dplyr, warn.conflicts = FALSE)
22
library(stringr)
33
skip_if_not(has_arrow_with_substrait())
4+
skip_if_not(has_duckdb_with_substrait())
45

56
example_with_logical_factors <- tibble::tibble(
67
starting_a_fight = factor(c(FALSE, TRUE, TRUE, TRUE)),
@@ -36,12 +37,15 @@ test_that("group_by supports creating/renaming", {
3637
)
3738

3839
compare_dplyr_binding(
40+
engine = "duckdb",
3941
.input %>%
4042
group_by(chr, numbers = int * 4) %>%
4143
collect(),
4244
example_data
4345
)
46+
4447
compare_dplyr_binding(
48+
engine = "duckdb",
4549
.input %>%
4650
group_by(int > 4, lgl, foo = int > 5) %>%
4751
collect(),

tests/testthat/test-dplyr-mutate.R

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
library(dplyr, warn.conflicts = FALSE)
22
library(arrow)
33
skip_if_not(has_arrow_with_substrait())
4+
skip_if_not(has_duckdb_with_substrait())
45

56
library(stringr)
67

tests/testthat/test-dplyr-select.R

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
library(dplyr, warn.conflicts = FALSE)
22
library(arrow)
33
skip_if_not(has_arrow_with_substrait())
4+
skip_if_not(has_duckdb_with_substrait())
45

56
test_that("Empty select returns no columns", {
67
skip("Arrow - https://github.com/voltrondata/substrait-r/issues/51")

tests/testthat/test-dplyr-summarise.R

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ withr::local_options(list(
77
))
88

99
skip_if_not(has_arrow_with_substrait())
10+
skip_if_not(has_duckdb_with_substrait())
1011

1112
library(dplyr, warn.conflicts = FALSE)
1213
library(stringr)

tests/testthat/test-pkg-arrow.R

+32
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,35 @@
1+
skip_if_not(has_arrow_with_substrait())
2+
3+
test_that("ArrowSubstraitCompiler$plan() generates the correct extension URIs", {
4+
df <- tibble::tibble(x = 1:3)
5+
6+
compiler <- arrow_substrait_compiler(df) %>%
7+
substrait_select(x1 = x > 2, x2 = x + 2)
8+
9+
plan <- compiler$plan()
10+
expect_length(plan$extension_uris, 2)
11+
12+
expect_identical(plan$extensions[[1]]$extension_function$name, "gt")
13+
expect_identical(
14+
plan$extensions[[1]]$extension_function$extension_uri_reference,
15+
# uri reference for "comparison" (anchor 2L), which declares "gt"
16+
2
17+
)
18+
19+
expect_identical(plan$extensions[[2]]$extension_function$name, "add")
20+
expect_identical(
21+
plan$extensions[[2]]$extension_function$extension_uri_reference,
22+
# uri reference for "arithmetic" (anchor 1L), which declares "add"
23+
1
24+
)
25+
26+
out_df <- as.data.frame(compiler$evaluate())
27+
28+
expect_identical(
29+
out_df,
30+
tibble::tibble(x1 = c(FALSE, FALSE, TRUE), x2 = c(3, 4, 5))
31+
)
32+
})
133

234
test_that("substrait_compiler() creates an ArrowSubstraitCompiler for ArrowTabular", {
335
rb <- arrow::record_batch(

tests/testthat/test-pkg-duckdb.R

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
skip_if_not(has_duckdb_with_substrait())
12

23
test_that("duckdb_substrait_compiler() works", {
34
skip_if_not(has_duckdb_with_substrait())

0 commit comments

Comments
 (0)