Skip to content

Commit f2c3928

Browse files
GH-34519: [C++][R] Fix dataset scans that project the same name as a field (#34576)
### Rationale for this change Fixes #34519. #33770 introduced the bug; I had [asked](https://github.com/apache/arrow/pull/33770/files#r1081612013) in the review why the C++ function wasn't using `FieldsInExpression`. I swapped that in, and the test I added to reproduce the bug now passes. ### What changes are included in this PR? Fix for the C++ function, test in R. ### Are these changes tested? Yes ### Are there any user-facing changes? The behavior observed in the report no longer happens. * Closes: #34519 Authored-by: Neal Richardson <neal.p.richardson@gmail.com> Signed-off-by: Weston Pace <weston.pace@gmail.com>
1 parent 1f8a335 commit f2c3928

File tree

2 files changed

+30
-16
lines changed

2 files changed

+30
-16
lines changed

cpp/src/arrow/dataset/scanner.cc

+11-13
Original file line numberDiff line numberDiff line change
@@ -141,19 +141,17 @@ Result<std::shared_ptr<Schema>> GetProjectedSchemaFromExpression(
141141
if (call->function_name != "make_struct") {
142142
return Status::Invalid("Top level projection expression call must be make_struct");
143143
}
144-
for (const compute::Expression& arg : call->arguments) {
145-
if (auto field_ref = arg.field_ref()) {
146-
if (field_ref->IsName()) {
147-
field_names.emplace(*field_ref->name());
148-
} else if (field_ref->IsNested()) {
149-
// We keep the top-level field name.
150-
auto nested_field_refs = *field_ref->nested_refs();
151-
field_names.emplace(*nested_field_refs[0].name());
152-
} else {
153-
return Status::Invalid(
154-
"No projected schema was supplied and we could not infer the projected "
155-
"schema from the projection expression.");
156-
}
144+
for (auto field_ref : compute::FieldsInExpression(projection)) {
145+
if (field_ref.IsName()) {
146+
field_names.emplace(*field_ref.name());
147+
} else if (field_ref.IsNested()) {
148+
// We keep the top-level field name.
149+
auto nested_field_refs = *field_ref.nested_refs();
150+
field_names.emplace(*nested_field_refs[0].name());
151+
} else {
152+
return Status::Invalid(
153+
"No projected schema was supplied and we could not infer the projected "
154+
"schema from the projection expression.");
157155
}
158156
}
159157
}

r/tests/testthat/test-dplyr-query.R

+19-3
Original file line numberDiff line numberDiff line change
@@ -740,12 +740,19 @@ test_that("Can use nested field refs", {
740740
collect(),
741741
nested_data
742742
)
743+
})
743744

744-
# Now with Dataset: make sure column pushdown in ScanNode works
745+
test_that("Can use nested field refs with Dataset", {
745746
skip_if_not_available("dataset")
747+
# Now with Dataset: make sure column pushdown in ScanNode works
748+
nested_data <- tibble(int = 1:5, df_col = tibble(a = 6:10, b = 11:15))
749+
tf <- tempfile()
750+
dir.create(tf)
751+
write_dataset(nested_data, tf)
752+
ds <- open_dataset(tf)
753+
746754
expect_equal(
747-
nested_data %>%
748-
InMemoryDataset$create() %>%
755+
ds %>%
749756
mutate(
750757
nested = df_col$a,
751758
times2 = df_col$a * 2
@@ -759,6 +766,15 @@ test_that("Can use nested field refs", {
759766
) %>%
760767
filter(nested > 7)
761768
)
769+
# Issue #34519: error when projecting same name, but only on file dataset
770+
expect_equal(
771+
ds %>%
772+
mutate(int = as.numeric(int)) %>%
773+
collect(),
774+
nested_data %>%
775+
mutate(int = as.numeric(int)) %>%
776+
collect()
777+
)
762778
})
763779

764780
test_that("Use struct_field for $ on non-field-ref", {

0 commit comments

Comments
 (0)