[r] Select first and last row from grouped data

Question

Using dplyr, how do I select the top and bottom observations/rows of grouped data in one statement?

Data & Example

Given a data frame

df <- data.frame(id=c(1,1,1,2,2,2,3,3,3), 
                 stopId=c("a","b","c","a","b","c","a","b","c"), 
                 stopSequence=c(1,2,3,3,1,4,3,1,2))

I can get the top and bottom observations from each group using slice, but using two separate statments:

firstStop <- df %>%
  group_by(id) %>%
  arrange(stopSequence) %>%
  slice(1) %>%
  ungroup

lastStop <- df %>%
  group_by(id) %>%
  arrange(stopSequence) %>%
  slice(n()) %>%
  ungroup

Can I combine these two statmenets into one that selects both top and bottom observations?

This question is related to r dplyr

The answer is


A different base R alternative would be to first order by id and stopSequence, split them based on id and for every id we select only the first and last index and subset the dataframe using those indices.

df[sapply(with(df, split(order(id, stopSequence), id)), function(x) 
                   c(x[1], x[length(x)])), ]


#  id stopId stopSequence
#1  1      a            1
#3  1      c            3
#5  2      b            1
#6  2      c            4
#8  3      b            1
#7  3      a            3

Or similar using by

df[unlist(with(df, by(order(id, stopSequence), id, function(x) 
                   c(x[1], x[length(x)])))), ]

Just for completeness: You can pass slice a vector of indices:

df %>% arrange(stopSequence) %>% group_by(id) %>% slice(c(1,n()))

which gives

  id stopId stopSequence
1  1      a            1
2  1      c            3
3  2      b            1
4  2      c            4
5  3      b            1
6  3      a            3

Using data.table:

# convert to data.table
setDT(df) 
# order, group, filter
df[order(stopSequence)][, .SD[c(1, .N)], by = id]

   id stopId stopSequence
1:  1      a            1
2:  1      c            3
3:  2      b            1
4:  2      c            4
5:  3      b            1
6:  3      a            3

Something like:

library(dplyr)

df <- data.frame(id=c(1,1,1,2,2,2,3,3,3),
                 stopId=c("a","b","c","a","b","c","a","b","c"),
                 stopSequence=c(1,2,3,3,1,4,3,1,2))

first_last <- function(x) {
  bind_rows(slice(x, 1), slice(x, n()))
}

df %>%
  group_by(id) %>%
  arrange(stopSequence) %>%
  do(first_last(.)) %>%
  ungroup

## Source: local data frame [6 x 3]
## 
##   id stopId stopSequence
## 1  1      a            1
## 2  1      c            3
## 3  2      b            1
## 4  2      c            4
## 5  3      b            1
## 6  3      a            3

With do you can pretty much perform any number of operations on the group but @jeremycg's answer is way more appropriate for just this task.


I know the question specified dplyr. But, since others already posted solutions using other packages, I decided to have a go using other packages too:

Base package:

df <- df[with(df, order(id, stopSequence, stopId)), ]
merge(df[!duplicated(df$id), ], 
      df[!duplicated(df$id, fromLast = TRUE), ], 
      all = TRUE)

data.table:

df <-  setDT(df)
df[order(id, stopSequence)][, .SD[c(1,.N)], by=id]

sqldf:

library(sqldf)
min <- sqldf("SELECT id, stopId, min(stopSequence) AS StopSequence
      FROM df GROUP BY id 
      ORDER BY id, StopSequence, stopId")
max <- sqldf("SELECT id, stopId, max(stopSequence) AS StopSequence
      FROM df GROUP BY id 
      ORDER BY id, StopSequence, stopId")
sqldf("SELECT * FROM min
      UNION
      SELECT * FROM max")

In one query:

sqldf("SELECT * 
        FROM (SELECT id, stopId, min(stopSequence) AS StopSequence
              FROM df GROUP BY id 
              ORDER BY id, StopSequence, stopId)
        UNION
        SELECT *
        FROM (SELECT id, stopId, max(stopSequence) AS StopSequence
              FROM df GROUP BY id 
              ORDER BY id, StopSequence, stopId)")

Output:

  id stopId StopSequence
1  1      a            1
2  1      c            3
3  2      b            1
4  2      c            4
5  3      a            3
6  3      b            1

Not dplyr, but it's much more direct using data.table:

library(data.table)
setDT(df)
df[ df[order(id, stopSequence), .I[c(1L,.N)], by=id]$V1 ]
#    id stopId stopSequence
# 1:  1      a            1
# 2:  1      c            3
# 3:  2      b            1
# 4:  2      c            4
# 5:  3      b            1
# 6:  3      a            3

More detailed explanation:

# 1) get row numbers of first/last observations from each group
#    * basically, we sort the table by id/stopSequence, then,
#      grouping by id, name the row numbers of the first/last
#      observations for each id; since this operation produces
#      a data.table
#    * .I is data.table shorthand for the row number
#    * here, to be maximally explicit, I've named the variable V1
#      as row_num to give other readers of my code a clearer
#      understanding of what operation is producing what variable
first_last = df[order(id, stopSequence), .(row_num = .I[c(1L,.N)]), by=id]
idx = first_last$row_num

# 2) extract rows by number
df[idx]

Be sure to check out the Getting Started wiki for getting the data.table basics covered


using which.min and which.max :

library(dplyr, warn.conflicts = F)
df %>% 
  group_by(id) %>% 
  slice(c(which.min(stopSequence), which.max(stopSequence)))

#> # A tibble: 6 x 3
#> # Groups:   id [3]
#>      id stopId stopSequence
#>   <dbl> <fct>         <dbl>
#> 1     1 a                 1
#> 2     1 c                 3
#> 3     2 b                 1
#> 4     2 c                 4
#> 5     3 b                 1
#> 6     3 a                 3

benchmark

It is also much faster than the current accepted answer because we find the min and max value by group, instead of sorting the whole stopSequence column.

# create a 100k times longer data frame
df2 <- bind_rows(replicate(1e5, df, F)) 
bench::mark(
  mm =df2 %>% 
    group_by(id) %>% 
    slice(c(which.min(stopSequence), which.max(stopSequence))),
  jeremy = df2 %>%
    group_by(id) %>%
    arrange(stopSequence) %>%
    filter(row_number()==1 | row_number()==n()))
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 mm           22.6ms     27ms     34.9     14.2MB     21.3
#> 2 jeremy      254.3ms    273ms      3.66    58.4MB     11.0

Another approach with lapply and a dplyr statement. We can apply an arbitrary number of whatever summary functions to the same statement:

lapply(c(first, last), 
       function(x) df %>% group_by(id) %>% summarize_all(funs(x))) %>% 
bind_rows()

You could for example be interested in rows with the max stopSequence value as well and do:

lapply(c(first, last, max("stopSequence")), 
       function(x) df %>% group_by(id) %>% summarize_all(funs(x))) %>%
bind_rows()