summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/R/dplyr-select.R
blob: 9a867ced964c923f960c3459fc2cf0141750c207 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


# The following S3 methods are registered on load if dplyr is present

tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns)

select.arrow_dplyr_query <- function(.data, ...) {
  check_select_helpers(enexprs(...))
  column_select(as_adq(.data), !!!enquos(...))
}
select.Dataset <- select.ArrowTabular <- select.arrow_dplyr_query

rename.arrow_dplyr_query <- function(.data, ...) {
  check_select_helpers(enexprs(...))
  column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename)
}
rename.Dataset <- rename.ArrowTabular <- rename.arrow_dplyr_query

column_select <- function(.data, ..., .FUN = vars_select) {
  # .FUN is either tidyselect::vars_select or tidyselect::vars_rename
  # It operates on the names() of selected_columns, i.e. the column names
  # factoring in any renaming that may already have happened
  out <- .FUN(names(.data), !!!enquos(...))
  # Make sure that the resulting selected columns map back to the original data,
  # as in when there are multiple renaming steps
  .data$selected_columns <- set_names(.data$selected_columns[out], names(out))

  # If we've renamed columns, we need to project that renaming into other
  # query parameters we've collected
  renamed <- out[names(out) != out]
  if (length(renamed)) {
    # Massage group_by
    gbv <- .data$group_by_vars
    renamed_groups <- gbv %in% renamed
    gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
    .data$group_by_vars <- gbv
    # No need to massage filters because those contain references to Arrow objects
  }
  .data
}

relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) {
  # The code in this function is adapted from the code in dplyr::relocate.data.frame
  # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R
  # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829

  .data <- as_adq(.data)

  # Assign the schema to the expressions
  map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))

  # Create a mask for evaluating expressions in tidyselect helpers
  mask <- new_environment(.cache$functions, parent = caller_env())

  to_move <- eval_select(substitute(c(...)), .data$selected_columns, mask)

  .before <- enquo(.before)
  .after <- enquo(.after)
  has_before <- !quo_is_null(.before)
  has_after <- !quo_is_null(.after)

  if (has_before && has_after) {
    abort("Must supply only one of `.before` and `.after`.")
  } else if (has_before) {
    where <- min(unname(eval_select(quo_get_expr(.before), .data$selected_columns, mask)))
    if (!where %in% to_move) {
      to_move <- c(to_move, where)
    }
  } else if (has_after) {
    where <- max(unname(eval_select(quo_get_expr(.after), .data$selected_columns, mask)))
    if (!where %in% to_move) {
      to_move <- c(where, to_move)
    }
  } else {
    where <- 1L
    if (!where %in% to_move) {
      to_move <- c(to_move, where)
    }
  }

  lhs <- setdiff(seq2(1, where - 1), to_move)
  rhs <- setdiff(seq2(where + 1, length(.data$selected_columns)), to_move)

  pos <- vec_unique(c(lhs, to_move, rhs))
  new_names <- names(pos)
  .data$selected_columns <- .data$selected_columns[pos]

  if (!is.null(new_names)) {
    names(.data$selected_columns)[new_names != ""] <- new_names[new_names != ""]
  }
  .data
}
relocate.Dataset <- relocate.ArrowTabular <- relocate.arrow_dplyr_query

check_select_helpers <- function(exprs) {
  # Throw an error if unsupported tidyselect selection helpers in `exprs`
  exprs <- lapply(exprs, function(x) if (is_quosure(x)) quo_get_expr(x) else x)
  unsup_select_helpers <- "where"
  funs_in_exprs <- unlist(lapply(exprs, all_funs))
  unsup_funs <- funs_in_exprs[funs_in_exprs %in% unsup_select_helpers]
  if (length(unsup_funs)) {
    stop(
      "Unsupported selection ",
      ngettext(length(unsup_funs), "helper: ", "helpers: "),
      oxford_paste(paste0(unsup_funs, "()"), quote = FALSE),
      call. = FALSE
    )
  }
}