Subsetting • tibble

There are many, many ways to subset data frames and tibbles.

This vignette is an attempt to provide a comprehensive overview of the behavior of the subsetting operators $, [[ and [, highlighting where the tibble implementation differs from the data frame implementation.

library(tibble)
# Build the example data frame used throughout: an integer column `a`,
# a character column `b`, and a list column `cd` with elements of
# differing type and length.
new_df <- function() {
  out <- data.frame(a = seq_len(4L))
  # Columns are attached after construction so that the character vector
  # and the list are stored as-is, one element per row.
  out[["b"]] <- letters[5:8]
  out[["cd"]] <- list(9, 10:11, 12:14, "text")
  out
}
# Build the tibble counterpart of the example data: the data frame from
# new_df() converted with as_tibble().
new_tbl <- function() {
  base_df <- new_df()
  as_tibble(base_df)
}

Results of the same code for data frames and tibbles are presented side by side:

new_df()
#>   a b         cd
#> 1 1 e          9
#> 2 2 f     10, 11
#> 3 3 g 12, 13, 14
#> 4 4 h       text

new_tbl()
#> # A tibble: 4 × 3
#>       a b     cd       
#>   <int> <chr> <list>   
#> 1     1 e     <dbl [1]>
#> 2     2 f     <int [2]>
#> 3     3 g     <int [3]>
#> 4     4 h     <chr [1]>

In the following, if the results are identical (after converting to a data frame if necessary), only the tibble result is shown, as in the example below. This makes differences easier to spot.

new_tbl()
#> # A tibble: 4 × 3
#>       a b     cd       
#>   <int> <chr> <list>   
#> 1     1 e     <dbl [1]>
#> 2     2 f     <int [2]>
#> 3     3 g     <int [3]>
#> 4     4 h     <chr [1]>

Subsetting operations are read-only. The same objects are reused in all examples:

# Shared example objects: `df` is the base data frame, `tbl` its tibble
# counterpart.
df <- new_df()
tbl <- new_tbl()

$

With $ subsetting, tibbles give a warning when accessing a missing column. Inexact (partial) matching is not supported by tibbles:

	`tbl$a #> [1] 1 2 3 4`
	`tbl$"a" #> [1] 1 2 3 4`
	`tbl$a[2:3] #> [1] 2 3`
	`tbl$cd #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"`
`df$c #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"`	tbl$c #> Warning: Unknown or uninitialised #> column: `c`. #> NULL
`df$d #> NULL`	tbl$d #> Warning: Unknown or uninitialised #> column: `d`. #> NULL

[[

The exact argument is not supported by tibbles.

	`tbl[["a"]] #> [1] 1 2 3 4`
	`tbl[["cd", exact = TRUE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"`
`df[["cd", exact = FALSE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"`	tbl[["cd", exact = FALSE]] #> Warning: `exact` ignored. #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"
	`tbl[["c", exact = TRUE]] #> NULL`
`df[["c", exact = FALSE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"`	tbl[["c", exact = FALSE]] #> Warning: `exact` ignored. #> NULL

With two indexes, a single element is returned. List columns are not unpacked by tibbles; the [[ operator only unpacks columns.

	`tbl[[2, "a"]] #> [1] 2`
`df[[2, "cd"]] #> [1] 10 11`	`tbl[[2, "cd"]] #> [[1]] #> [1] 10 11`
df[[1:2, "cd"]] #> Error in `col[[i, exact = exact]]`: #> ! subscript out of bounds	tbl[[1:2, "cd"]] #> Error in `tbl[[1:2, "cd"]]`: #> ! Can't extract row with `1:2`. #> ✖ Subscript `1:2` must be size 1, not 2.
	`tbl[[2, "c"]] #> NULL`
`df[[1:2, "c"]] #> NULL`	tbl[[1:2, "c"]] #> Error in `tbl[[1:2, "c"]]`: #> ! Can't extract row with `1:2`. #> ✖ Subscript `1:2` must be size 1, not 2.

Exotic variants like recursive indexing are deprecated for tibbles.

df[[c(1, 2)]]
#> [1] 2

tbl[[c(1, 2)]]
#> Error:
#> ! The `j` argument of
#>   `[[.tbl_df()` can't be a vector
#>   of length 2 as of tibble 3.0.0.
#> ℹ Recursive subsetting is
#>   deprecated for tibbles.

[

With [ subsetting, tibbles always return a tibble. The drop argument is supported but has different defaults:

	`tbl["a"] #> # A tibble: 4 × 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4`
df["a", drop = FALSE] #> Warning in `[.data.frame`(df, "a", #> drop = FALSE): 'drop' argument will #> be ignored #> a #> 1 1 #> 2 2 #> 3 3 #> 4 4	tbl["a", drop = FALSE] #> Warning: `drop` argument ignored #> for subsetting a tibble with #> `x[j]`, it has an effect only for #> `x[i, j]`. #> # A tibble: 4 × 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4
df["a", drop = TRUE] #> Warning in `[.data.frame`(df, "a", #> drop = TRUE): 'drop' argument will #> be ignored #> a #> 1 1 #> 2 2 #> 3 3 #> 4 4	tbl["a", drop = TRUE] #> Warning: `drop` argument ignored #> for subsetting a tibble with #> `x[j]`, it has an effect only for #> `x[i, j]`. #> # A tibble: 4 × 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4
	`tbl[1] #> # A tibble: 4 × 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4`
	`tbl[0] #> # A tibble: 4 × 0`
df[4] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[4] #> Error in `tbl[4]`: #> ! Can't subset columns past the end. #> ℹ Location 4 doesn't exist. #> ℹ There are only 3 columns.
df[NA] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[NA] #> Error in `tbl[NA]`: #> ! Can't subset columns with `NA`. #> ✖ Subscript `NA` can't contain missing values. #> ✖ It has a missing value at location 1.
df[NA_character_] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[NA_character_] #> Error in `tbl[NA_character_]`: #> ! Can't subset columns with `NA_character_`. #> ✖ Subscript `NA_character_` can't contain missing values. #> ✖ It has a missing value at location 1.
df[NA_integer_] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[NA_integer_] #> Error in `tbl[NA_integer_]`: #> ! Can't subset columns with `NA_integer_`. #> ✖ Subscript `NA_integer_` can't contain missing values. #> ✖ It has a missing value at location 1.

The same examples are repeated for two-dimensional indexing when omitting the row index:

`df[, "a"] #> [1] 1 2 3 4`	`tbl[, "a"] #> # A tibble: 4 × 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4`
	`tbl[, "a", drop = FALSE] #> # A tibble: 4 × 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4`
	`tbl[, "a", drop = TRUE] #> [1] 1 2 3 4`
`df[, 1] #> [1] 1 2 3 4`	`tbl[, 1] #> # A tibble: 4 × 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4`
	`tbl[, 0] #> # A tibble: 4 × 0`
df[, 4] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[, 4] #> Error in `tbl[, 4]`: #> ! Can't subset columns past the end. #> ℹ Location 4 doesn't exist. #> ℹ There are only 3 columns.
df[, NA] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[, NA] #> Error in `tbl[, NA]`: #> ! Can't subset columns with `NA`. #> ✖ Subscript `NA` can't contain missing values. #> ✖ It has a missing value at location 1.
df[, NA_character_] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[, NA_character_] #> Error in `tbl[, NA_character_]`: #> ! Can't subset columns with `NA_character_`. #> ✖ Subscript `NA_character_` can't contain missing values. #> ✖ It has a missing value at location 1.
df[, NA_integer_] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[, NA_integer_] #> Error in `tbl[, NA_integer_]`: #> ! Can't subset columns with `NA_integer_`. #> ✖ Subscript `NA_integer_` can't contain missing values. #> ✖ It has a missing value at location 1.

Multiple columns can be queried by passing a vector of column indexes (names, positions, or even a logical vector). With the latter option, tibbles are a tad stricter:

	`tbl[c("a", "b")] #> # A tibble: 4 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h`
	`tbl[character()] #> # A tibble: 4 × 0`
	`tbl[1:2] #> # A tibble: 4 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h`
	`tbl[1:3] #> # A tibble: 4 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>`
df[1:4] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[1:4] #> Error in `tbl[1:4]`: #> ! Can't subset columns past the end. #> ℹ Location 4 doesn't exist. #> ℹ There are only 3 columns.
	`tbl[0:2] #> # A tibble: 4 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h`
df[-1:2] #> Error in `[.default`: #> ! only 0's may be mixed with negative subscripts	tbl[-1:2] #> Error in `tbl[-1:2]`: #> ! Can't subset columns with `-1:2`. #> ✖ Negative and positive locations can't be mixed. #> ℹ Subscript `-1:2` has 2 positive values at locations 3 and 4.
	`tbl[-1] #> # A tibble: 4 × 2 #> b cd #> <chr> <list> #> 1 e <dbl [1]> #> 2 f <int [2]> #> 3 g <int [3]> #> 4 h <chr [1]>`
	`tbl[-(1:2)] #> # A tibble: 4 × 1 #> cd #> <list> #> 1 <dbl [1]> #> 2 <int [2]> #> 3 <int [3]> #> 4 <chr [1]>`
	`tbl[integer()] #> # A tibble: 4 × 0`
	`tbl[TRUE] #> # A tibble: 4 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>`
	`tbl[FALSE] #> # A tibble: 4 × 0`
	`tbl[c(TRUE, TRUE, FALSE)] #> # A tibble: 4 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h`
	`tbl[c(FALSE, TRUE, FALSE)] #> # A tibble: 4 × 1 #> b #> <chr> #> 1 e #> 2 f #> 3 g #> 4 h`
`df[c(FALSE, TRUE)] #> b #> 1 e #> 2 f #> 3 g #> 4 h`	tbl[c(FALSE, TRUE)] #> Error in `tbl[c(FALSE, TRUE)]`: #> ! Can't subset columns with `c(FALSE, TRUE)`. #> ✖ Logical subscript `c(FALSE, TRUE)` must be size 1 or 3, not 2.
df[c(FALSE, TRUE, FALSE, TRUE)] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[c(FALSE, TRUE, FALSE, TRUE)] #> Error in `tbl[c(FALSE, TRUE, FALSE, TRUE)]`: #> ! Can't subset columns with `c(FALSE, TRUE, FALSE, TRUE)`. #> ✖ Logical subscript `c(FALSE, TRUE, FALSE, TRUE)` must be size 1 or 3, not 4.

The same examples are repeated for two-dimensional indexing when omitting the row index:

	`tbl[, c("a", "b")] #> # A tibble: 4 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h`
	`tbl[, character()] #> # A tibble: 4 × 0`
	`tbl[, 1:2] #> # A tibble: 4 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h`
	`tbl[, 1:3] #> # A tibble: 4 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>`
df[, 1:4] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[, 1:4] #> Error in `tbl[, 1:4]`: #> ! Can't subset columns past the end. #> ℹ Location 4 doesn't exist. #> ℹ There are only 3 columns.
	`tbl[, 0:2] #> # A tibble: 4 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h`
df[, -1:2] #> Error in `.subset()`: #> ! only 0's may be mixed with negative subscripts	tbl[, -1:2] #> Error in `tbl[, -1:2]`: #> ! Can't subset columns with `-1:2`. #> ✖ Negative and positive locations can't be mixed. #> ℹ Subscript `-1:2` has 2 positive values at locations 3 and 4.
	`tbl[, -1] #> # A tibble: 4 × 2 #> b cd #> <chr> <list> #> 1 e <dbl [1]> #> 2 f <int [2]> #> 3 g <int [3]> #> 4 h <chr [1]>`
`df[, -(1:2)] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"`	`tbl[, -(1:2)] #> # A tibble: 4 × 1 #> cd #> <list> #> 1 <dbl [1]> #> 2 <int [2]> #> 3 <int [3]> #> 4 <chr [1]>`
	`tbl[, integer()] #> # A tibble: 4 × 0`
	`tbl[, TRUE] #> # A tibble: 4 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>`
	`tbl[, FALSE] #> # A tibble: 4 × 0`
	`tbl[, c(TRUE, TRUE, FALSE)] #> # A tibble: 4 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h`
`df[, c(FALSE, TRUE, FALSE)] #> [1] "e" "f" "g" "h"`	`tbl[, c(FALSE, TRUE, FALSE)] #> # A tibble: 4 × 1 #> b #> <chr> #> 1 e #> 2 f #> 3 g #> 4 h`
`df[, c(FALSE, TRUE)] #> [1] "e" "f" "g" "h"`	tbl[, c(FALSE, TRUE)] #> Error in `tbl[, c(FALSE, TRUE)]`: #> ! Can't subset columns with `c(FALSE, TRUE)`. #> ✖ Logical subscript `c(FALSE, TRUE)` must be size 1 or 3, not 2.
df[, c(FALSE, TRUE, FALSE, TRUE)] #> Error in `[.data.frame`: #> ! undefined columns selected	tbl[, c(FALSE, TRUE, FALSE, TRUE)] #> Error in `tbl[, c(FALSE, TRUE, FALSE, TRUE)]`: #> ! Can't subset columns with `c(FALSE, TRUE, FALSE, TRUE)`. #> ✖ Logical subscript `c(FALSE, TRUE, FALSE, TRUE)` must be size 1 or 3, not 4.

Row subsetting with integer indexes works almost identically. Out-of-bounds subsetting is not recommended and may lead to an error in future versions. Another special case is subsetting with [1, , drop = TRUE], where the data frame implementation returns a list.

	`tbl[1, ] #> # A tibble: 1 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]>`
`df[1, , drop = TRUE] #> $a #> [1] 1 #> #> $b #> [1] "e" #> #> $cd #> $cd[[1]] #> [1] 9`	`tbl[1, , drop = TRUE] #> # A tibble: 1 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]>`
	`tbl[1:2, ] #> # A tibble: 2 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]>`
	`tbl[0, ] #> # A tibble: 0 × 3 #> # ℹ 3 variables: a <int>, b <chr>, #> # cd <list>`
	`tbl[integer(), ] #> # A tibble: 0 × 3 #> # ℹ 3 variables: a <int>, b <chr>, #> # cd <list>`
	`tbl[5, ] #> # A tibble: 1 × 3 #> a b cd #> <int> <chr> <list> #> 1 NA NA <NULL>`
	`tbl[4:5, ] #> # A tibble: 2 × 3 #> a b cd #> <int> <chr> <list> #> 1 4 h <chr [1]> #> 2 NA NA <NULL>`
	`tbl[-1, ] #> # A tibble: 3 × 3 #> a b cd #> <int> <chr> <list> #> 1 2 f <int [2]> #> 2 3 g <int [3]> #> 3 4 h <chr [1]>`
df[-1:2, ] #> Error in `xj[i]`: #> ! only 0's may be mixed with negative subscripts	tbl[-1:2, ] #> Error in `tbl[-1:2, ]`: #> ! Can't subset rows with `-1:2`. #> ✖ Negative and positive locations can't be mixed. #> ℹ Subscript `-1:2` has 2 positive values at locations 3 and 4.
	`tbl[NA, ] #> # A tibble: 4 × 3 #> a b cd #> <int> <chr> <list> #> 1 NA NA <NULL> #> 2 NA NA <NULL> #> 3 NA NA <NULL> #> 4 NA NA <NULL>`
	`tbl[NA_integer_, ] #> # A tibble: 1 × 3 #> a b cd #> <int> <chr> <list> #> 1 NA NA <NULL>`
	`tbl[c(NA, 1), ] #> # A tibble: 2 × 3 #> a b cd #> <int> <chr> <list> #> 1 NA NA <NULL> #> 2 1 e <dbl [1]>`

Row subsetting with logical indexes also works almost identically; with tibbles, the index vector must have length one or the number of rows.

	`tbl[TRUE, ] #> # A tibble: 4 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>`
	`tbl[FALSE, ] #> # A tibble: 0 × 3 #> # ℹ 3 variables: a <int>, b <chr>, #> # cd <list>`
`df[c(TRUE, FALSE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14`	tbl[c(TRUE, FALSE), ] #> Error in `tbl[c(TRUE, FALSE), ]`: #> ! Can't subset rows with `c(TRUE, FALSE)`. #> ✖ Logical subscript `c(TRUE, FALSE)` must be size 1 or 4, not 2.
`df[c(TRUE, FALSE, TRUE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14 #> 4 4 h text`	tbl[c(TRUE, FALSE, TRUE), ] #> Error in `tbl[c(TRUE, FALSE, TRUE), ]`: #> ! Can't subset rows with `c(TRUE, FALSE, TRUE)`. #> ✖ Logical subscript `c(TRUE, FALSE, TRUE)` must be size 1 or 4, not 3.
	`tbl[c(TRUE, FALSE, TRUE, FALSE), ] #> # A tibble: 2 × 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 3 g <int [3]>`
`df[c(TRUE, FALSE, TRUE, FALSE, TRUE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14 #> NA NA <NA> NULL`	tbl[c(TRUE, FALSE, TRUE, FALSE, TRUE), ] #> Error in `tbl[c(TRUE, FALSE, TRUE, FALSE, #> TRUE), ]`: #> ! Can't subset rows with `c(TRUE, FALSE, TRUE, FALSE, TRUE)`. #> ✖ Logical subscript `c(TRUE, FALSE, TRUE, FALSE, TRUE)` must be size 1 or 4, not 5.

Indexing both row and column works more or less the same, except for drop:

`df[1, "a"] #> [1] 1`	`tbl[1, "a"] #> # A tibble: 1 × 1 #> a #> <int> #> 1 1`
	`tbl[1, "a", drop = FALSE] #> # A tibble: 1 × 1 #> a #> <int> #> 1 1`
	`tbl[1, "a", drop = TRUE] #> [1] 1`
`df[1:2, "a"] #> [1] 1 2`	`tbl[1:2, "a"] #> # A tibble: 2 × 1 #> a #> <int> #> 1 1 #> 2 2`
	`tbl[1:2, "a", drop = FALSE] #> # A tibble: 2 × 1 #> a #> <int> #> 1 1 #> 2 2`
	`tbl[1:2, "a", drop = TRUE] #> [1] 1 2`
	`tbl[1, c("a", "b")] #> # A tibble: 1 × 2 #> a b #> <int> <chr> #> 1 1 e`
	`tbl[1, c("a", "b"), drop = FALSE] #> # A tibble: 1 × 2 #> a b #> <int> <chr> #> 1 1 e`
`df[1, c("a", "b"), drop = TRUE] #> $a #> [1] 1 #> #> $b #> [1] "e"`	`tbl[1, c("a", "b"), drop = TRUE] #> # A tibble: 1 × 2 #> a b #> <int> <chr> #> 1 1 e`
	`tbl[1:2, c("a", "b")] #> # A tibble: 2 × 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f`

Indexes can be omitted altogether; there are no differences here:

tbl[]
#> # A tibble: 4 × 3
#>       a b     cd       
#>   <int> <chr> <list>   
#> 1     1 e     <dbl [1]>
#> 2     2 f     <int [2]>
#> 3     3 g     <int [3]>
#> 4     4 h     <chr [1]>

tbl[,]
#> # A tibble: 4 × 3
#>       a b     cd       
#>   <int> <chr> <list>   
#> 1     1 e     <dbl [1]>
#> 2     2 f     <int [2]>
#> 3     3 g     <int [3]>
#> 4     4 h     <chr [1]>