Anand
Anand

Reputation: 3760

faster method for ordered column names dataframe from numeric dataframe in R

I have a dataframe with three columns:

set.seed(123)
df <- data.frame(x = abs(rnorm(10)), y = abs(rnorm(10)), z = abs(rnorm(10)))
df
            x         y         z
1  0.56047565 1.2240818 1.0678237
2  0.23017749 0.3598138 0.2179749
3  1.55870831 0.4007715 1.0260044
4  0.07050839 0.1106827 0.7288912
5  0.12928774 0.5558411 0.6250393
6  1.71506499 1.7869131 1.6866933
7  0.46091621 0.4978505 0.8377870
8  1.26506123 1.9666172 0.1533731
9  0.68685285 0.7013559 1.1381369
10 0.44566197 0.4727914 1.2538149

I want to construct a dataframe with the same number of rows, having in each row, the column names of df, ordered by the corresponding row value in df. I have a for-loop based approach that works, but is too slow for a large dataframe, but am looking for a faster, vectorized approach. Here is the for loop based approach:

df_names <- df
df_names[,] <- NA
df_names
    x  y  z
1  NA NA NA
2  NA NA NA
3  NA NA NA
4  NA NA NA
5  NA NA NA
6  NA NA NA
7  NA NA NA
8  NA NA NA
9  NA NA NA
10 NA NA NA
for(r in 1:nrow(df)) {    
     sorted_row <- sort(df[r,], decreasing  = TRUE)
     df_names[r,] <- colnames(sorted_row)
}
df_names
   x y z
1  y z x
2  y x z
3  x z y
4  z y x
5  z y x
6  y x z
7  z y x
8  y x z
9  z y x
10 z y x

How do I do this faster using the apply family or vectorization?

Upvotes: 3

Views: 140

Answers (2)

Anand
Anand

Reputation: 3760

Revised: I merged all attempts, corrections by @rawr, and @rawr's approach is the best so far - with a 30x savings. @989 added a much faster approach. See accepted answer by @989.

library(microbenchmark)
set.seed(123)
df <- data.frame(x = abs(rnorm(1000)), y = abs(rnorm(1000)), z = abs(rnorm(1000)))

get_name_df_with_for = function(df) {
    df_names <- df
    df_names[,] <- NA
    for(r in 1:nrow(df)) {    
        df_names[r,] <- colnames(sort(df[r,], decreasing  = TRUE))
    }
    return(df_names)
}

get_name_df_with_apply = function(df) {
    df_names <- data.frame(t(apply(df, 1, function(row) names(sort(row, decreasing = TRUE)))))
    return(df_names)
}

get_name_df_with_apply_names = function(df) {
    df_names <- data.frame(t(apply(df, 1, function(row) names(row)[(order(row, decreasing = TRUE))])))
    return(df_names)
}


get_name_df_double_t = function(df) {
    df_names <- data.frame(t(apply(t(df), 2, function(col) names(sort(col, decreasing = TRUE)))))
    return(df_names)
}

microbenchmark(
    "for" = get_name_df_with_for(df),
    "double_transpose" = get_name_df_double_t(df),
    "apply" = get_name_df_with_apply(df),
    "apply_with_names" = get_name_df_with_apply_names(df),   
    times = 10
)
Unit: milliseconds
             expr       min        lq      mean    median        uq       max neval
              for 417.08341 424.37019 446.00655 451.67451 459.64900 480.33351    10
 double_transpose  28.46577  29.96637  32.44685  33.02763  33.51309  36.77468    10
            apply  27.54800  28.27331  38.02239  30.36667  37.29727  71.46596    10
 apply_with_names  12.35264  12.59502  14.16868  13.92946  15.80656  17.22005    10

Upvotes: 2

989
989

Reputation: 12937

If the number of columns in your df is just three, here is a faster solution using max.col. It is provably about 8x faster than the fastest solution proposed in the other answer when nrow(df)=100.

The case in which nrow(df)=100

library(microbenchmark)
set.seed(123)
size <- 100
df <- data.frame(x = abs(rnorm(size)), y = abs(rnorm(size)), z = abs(rnorm(size)))  

f1 <- function(df){
    vec <- unlist(t(df))
    sq <- seq(0,(nrow(df)-1)*3,3)
    m1 <- max.col(df)
    # -----------------------
    vec[sq+m1] <- -Inf
    m2 <- max.col(matrix(vec, ncol=3, byrow=T))
    vec[sq+m2] <- -Inf
    # -----------------------
    m3 <- max.col(matrix(vec, ncol=3, byrow=T))
    nm <- names(df)
    cbind(nm[m1], nm[m2], nm[m3])
}

all(f1(df)==get_name_df_with_for(df))
# [1] TRUE
all(f1(df)==get_name_df_with_apply(df))
# [1] TRUE
all(f1(df)==get_name_df_with_apply_names(df))
# [1] TRUE
all(f1(df)==get_name_df_double_t(df))
# [1] TRUE
microbenchmark(f1(df), "f2"=get_name_df_with_for(df), "f3"=get_name_df_with_apply(df), 
            "f4"=get_name_df_with_apply_names(df), "f5"=get_name_df_double_t(df))

# Unit: microseconds
   # expr       min         lq       mean    median         uq       max neval
 # f1(df)   395.643   458.0905   470.8278   472.633   492.7355   701.464   100
     # f2 59262.146 61773.0865 63098.5840 62963.223 64309.4780 74246.953   100
     # f3  5491.521  5637.1605  6754.3912  5801.619  5956.4545 90457.611   100
     # f4  3392.689  3463.9055  3603.1546  3569.125  3707.2795  4237.012   100
     # f5  5513.335  5636.3045  5954.9277  5781.089  5971.2115  8622.017   100

Significantly faster when nrow(df)=1000

# Unit: microseconds
   # expr        min          lq        mean      median          uq        max neval
 # f1(df)    693.765    769.8995    878.3698    815.6655    846.4615   3559.929   100
     # f2 627876.429 646057.8155 671925.4799 657768.6270 694047.9940 797900.142   100
     # f3  49570.397  52038.3515  54334.0501  53838.8465  56181.0515  62517.965   100
     # f4  28892.611  30046.8180  31961.4085  31262.4040  33057.5525  48694.850   100
     # f5  49866.379  51491.7235  54413.8287  53705.3970  55962.0575  75287.600   100

Upvotes: 1

Related Questions