Reputation: 45

R, select rows according to the rank of a certain column

I have a R dataframe like this below,

name score
marry 98
marry 77
marry 87
marry 96
mark 99
mark 44
mark 79
john 87
john 77

For each of the name, I want to select the rows with the highest 2 score, which should be,

name score
marry 98
marry 96
mark 99
mark 79
john 87
john 77

Could anyone help? Many thanks!

Upvotes: 3

Answers (4)

akrun

Reputation: 886988

You could try:

 devtools::install_github("hadley/dplyr")
 library(dplyr)


 df %>% 
      group_by(name) %>% 
      arrange(desc(score)) %>%
       slice(1:2)

 #     name score
 #1  john    87
 #2  john    77
 #3  mark    99
 #4  mark    79
 #5 marry    98
 #6 marry    96

Or using data.table

 library(data.table)
 setDT(df)[order(-score), .SD[1:2], by=name]
 #      name score
 #1:  mark    99
 #2:  mark    79
 #3: marry    98
 #4: marry    96
 #5:  john    87
 #6:  john    77

data

df <- structure(list(name = c("marry", "marry", "marry", "marry", "mark", 
   "mark", "mark", "john", "john"), score = c(98L, 77L, 87L, 96L, 
   99L, 44L, 79L, 87L, 77L)), .Names = c("name", "score"), class = "data.frame", row.names = c(NA, 
  -9L))

Benchmarks

Included dplyr and base R methods by @Richard Scriven. Created two datasets, 1) Same as generated by @Ananda Mahto's 2nd dataset, 2) 50x bigger.

Data

set.seed(1) #similar dataset as created by @Ananda Mahto
dfAM <- data.frame(
            name = sample(10000, 1000000, TRUE),
            score = sample(0:100, 1000000, TRUE)
            )


set.seed(1)
df2 <- data.frame(
             name = sample(10000, 50*1000000, TRUE),
             score = sample(0:100, 50*1000000, TRUE)
            )

Functions

  aMahto <- function(mydf) {mydf[with(mydf, 
             ave(-score, name, FUN = order)) %in% c(1, 2), ]
           }

  akrun1 <- function(mydf) {setDT(mydf)[order(-score), .SD[1:2], by=name] }
  akrun2 <- function(mydf) {setDT(mydf)[order(-score), head(.SD,2), by=name] }
  dArenburg <- function(mydf){ setorder(setDT(mydf), -score)[,
                                            head(.SD,2), by=name]}
  akrun3 <- function(mydf) { mydf %>% group_by(name) %>% 
                               arrange(desc(score)) %>% slice(1:2) }


  rScriven1 <- function(mydf) {sapply(split(mydf$score, mydf$name),
                                       function(x) tail(sort(x), 2))}
  rScriven2 <- function(mydf) {stack(lapply(split(mydf$score, mydf$name),
                                        function(x) tail(sort(x), 2)))}

Benchmarks Run the benchmarks for each dataset separately.

  library(microbenchmark)
  microbenchmark(aMahto(dfAM), akrun1(dfAM), akrun2(dfAM), akrun3(dfAM),
                dArenburg(dfAM), rScriven1(dfAM), rScriven2(dfAM), times=20L)
   Unit: milliseconds
         expr        min         lq       mean     median         uq
   aMahto(dfAM)  278.11839  283.82547  293.16843  285.45645  298.77528
   akrun1(dfAM) 2900.86528 2923.94839 2953.00178 2942.73172 2965.85868
   akrun2(dfAM)  189.93849  195.05222  202.30496  196.51019  207.73886
   akrun3(dfAM)   56.75191   57.14967   58.04335   57.32627   57.63047
dArenburg(dfAM)  161.87583  166.64286  171.67832  168.24355  170.14656
rScriven1(dfAM)  694.22503  701.85554  717.92201  713.15419  727.84196
rScriven2(dfAM)  712.84676  728.75839  744.07167  738.74824  759.88312
        max neval
  335.04468    20
 3113.54895    20
  250.09765    20
   66.67198    20
  228.44948    20
  777.78973    20
  797.85850    20

On the bigger dataset, @David Arenburg's method is the Winner.

    microbenchmark(aMahto(df2), akrun1(df2), akrun2(df2), akrun3(df2), 
                 dArenburg(df2), rScriven1(df2), rScriven2(df2), times=40L)
    Unit: seconds
            expr       min        lq      mean    median        uq       max
     aMahto(df2) 11.830111 12.027325 12.273881 12.213140 12.533628 13.196659
     akrun1(df2)  6.672874  6.890442  7.018749  6.956716  7.128060  7.542047
     akrun2(df2)  3.794502  3.829567  3.860565  3.847690  3.869065  4.143381
     akrun3(df2)  3.687974  3.725867  3.801861  3.743973  3.933935  4.102295
  dArenburg(df2)  1.531356  1.598570  1.647648  1.618573  1.640258  2.716042
  rScriven1(df2)  6.370144  6.573998  6.685313  6.616246  6.820830  7.118827
  rScriven2(df2)  6.551911  6.628134  6.743644  6.724310  6.867090  7.091750
     neval
      40
      40
      40
      40
      40
      40
      40

Upvotes: 4

A5C1D2H2I1M1N2O1R2T1

Reputation: 193517

Here's a possible base approach:

mydf[with(mydf, ave(-score, name, FUN = order)) %in% c(1, 2), ]
#    name score
# 1 marry    98
# 4 marry    96
# 5  mark    99
# 7  mark    79
# 8  john    87
# 9  john    77

For the curious, on timings--here's a small test...

Two sample datasets, both 1M rows, two columns, but one with only 1000 possible values for "name" and the other with 10000 possible values.

set.seed(1)
df1 <- data.frame(
  name = sample(1000, 1000000, TRUE),
  score = sample(0:100, 1000000, TRUE)
)
df2 <- data.frame(
  name = sample(10000, 1000000, TRUE),
  score = sample(0:100, 1000000, TRUE)
)

The functions to benchmark--I'll try to add "dplyr" later after I reinstall it.

fun1 <- function(mydf) {
  mydf[with(mydf, ave(-score, name, FUN = order)) %in% c(1, 2), ]
}

fun2 <- function(mydf) {
  as.data.table(mydf)[order(-score), .SD[1:2], by=name]
}

fun3 <- function(mydf) {
  df <- as.data.table(mydf)
  setorder(df, -score)[, head(.SD, 2), by = name]
}

The benchmarking.

library(microbenchmark)
microbenchmark(fun1(df1), fun2(df1), fun3(df1), 
               fun1(df2), fun2(df2), fun3(df2), times = 20)
# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval
#  fun1(df1)  502.76809  513.98317  569.47883  597.90488  603.34458  686.4302    20
#  fun2(df1)  733.12544  741.18777  796.67106  822.60824  828.88449  839.3837    20
#  fun3(df1)   87.80581   93.07012   95.34281   95.56374   97.49608  101.7991    20
#  fun1(df2)  672.60241  764.10237  764.60365  772.33959  780.14679  799.3505    20
#  fun2(df2) 6338.14881 6360.42621 6407.66675 6412.99278 6451.75626 6479.2681    20
#  fun3(df2)  354.24119  366.47396  382.58666  369.78597  374.01897  468.9197    20

Upvotes: 6

David Arenburg

Reputation: 92282

Here's another data.table approach using the new setorder function (order by reference)

library(data.table) # 1.9.4+
setorder(setDT(df), -score)[, head(.SD, 2), by = name]
#     name score
# 1:  mark    99
# 2:  mark    79
# 3: marry    98
# 4: marry    96
# 5:  john    87
# 6:  john    77

Upvotes: 5

Rich Scriven

Reputation: 99331

It's a different output, but how about this so that the names aren't repeated.

sapply(split(df$score, df$name), function(x) tail(sort(x), 2))
#      john mark marry
# [1,]   77   79    96
# [2,]   87   99    98

As suggested by Ananda Mahto, you could also use stack with lapply

stack(lapply(split(df$score, df$name), function(x) tail(sort(x), 2)))
#   values   ind
# 1     77  john
# 2     87  john
# 3     79  mark
# 4     99  mark
# 5     96 marry
# 6     98 marry

Upvotes: 3

R, select rows according to the rank of a certain column

Answers (4)

data

Benchmarks

Related Questions