stats_noob
stats_noob

Reputation: 5925

Converting Multiple DPLYR Commands into a Single DPLYR Command

I am working with the R programming language.

I have the following dataset ("my_data"):

structure(list(idd = 1:50, group_1 = c("B", "B", "A", "B", "B", 
"A", "A", "A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A", 
"B", "B", "A", "B", "A", "B", "A", "B", "B", "A", "B", "B", "B", 
"A", "B", "A", "B", "B", "A", "A", "A", "A", "A", "B", "B", "B", 
"A", "B", "B", "B", "B", "B"), v1 = c(15.7296737049317, -4.33377704672207, 
-0.551850185265, 2.66888122578048, 12.109072642513, 0.0107927293899017, 
20.7785032320562, -1.98974382507874, 12.1663703518471, 11.4308702978893, 
-0.657500910529805, 5.71376589298221, 3.43820523228653, 19.5939432685761, 
25.5605263610222, -0.407964337882465, 19.3057240854025, 9.24554068987809, 
-9.6719534905096, 2.44096357354807, 14.6114916050676, 11.4510663104787, 
-14.4231132108142, 15.8031868545157, 16.5505199848675, 6.95491162740581, 
2.92431767382703, 29.7157201447823, 9.10001319352251, 9.85982748068076, 
-1.23456937110154, -3.44130123376206, -5.23155771062088, 5.78031789617826, 
23.6092446408098, 27.5379484533487, 25.6836473435279, 22.9675556994775, 
7.62403748556388, -2.24150135680706, 6.72187319859928, -14.1245027627225, 
6.8620712655661, 26.5987870464572, 11.3095310060752, 20.9588868268958, 
14.8934095694391, 2.21089704551347, 27.4355935292935, 9.21612714668934
), group_2 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)), row.names = c(NA, -50L), class = "data.frame")

head(my_data)

   idd group_1          v1 group_2
1    1       B 15.72967370       1
2    2       B -4.33377705       2
3    3       A -0.55185019       3
4    4       B  2.66888123       4
5    5       B 12.10907264       5
6    6       A  0.01079273       6
7    7       A 20.77850323       7
8    8       A -1.98974383       8
9    9       B 12.16637035       9
10  10       A 11.43087030      10
11  11       A -0.65750091       1
12  12       B  5.71376589       2

For this dataset, I want to perform the following steps in "dplyr":

I know how to do this manually in R:

#STEP 1: since my_data has 50 rows, break my_data into 5 groups of 10 rows

rows_1 = my_data[1:10,]
rows_2 = my_data[11:20,]
rows_3 = my_data[21:30,]
rows_4 = my_data[31:40,]
rows_5 = my_data[41:50,]

# STEP 2: find out values of "v2"

library(dplyr)

dplyr_row_1 = data.frame(rows_1 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_1$v2 = ifelse(dplyr_row_1[1,2] > dplyr_row_1[2,2], "A", ifelse(dplyr_row_1[1,2] < dplyr_row_1[2,2], "B", 0))

dplyr_row_2 = data.frame(rows_2 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_2$v2 = ifelse(dplyr_row_2[1,2] > dplyr_row_2[2,2], "A", ifelse(dplyr_row_2[1,2] < dplyr_row_2[2,2], "B", 0))

dplyr_row_3 = data.frame(rows_3 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_3$v2 = ifelse(dplyr_row_3[1,2] > dplyr_row_3[2,2], "A", ifelse(dplyr_row_3[1,2] < dplyr_row_3[2,2], "B", 0))

dplyr_row_4 = data.frame(rows_4 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_4$v2 = ifelse(dplyr_row_4[1,2] > dplyr_row_4[2,2], "A", ifelse(dplyr_row_4[1,2] < dplyr_row_4[2,2], "B", 0))

dplyr_row_5 = data.frame(rows_5 %>% group_by(group_1) %>% summarize(sum = sum(v1)))

dplyr_row_5$v2 = ifelse(dplyr_row_5[1,2] > dplyr_row_5[2,2], "A", ifelse(dplyr_row_5[1,2] < dplyr_row_5[2,2], "B", 0))

# STEP 3: append "v2" to first 5 files:

rows_1$v2 = dplyr_row_1$v2
rows_2$v2 = dplyr_row_2$v2
rows_3$v2 = dplyr_row_3$v2
rows_4$v2 = dplyr_row_4$v2
rows_5$v2 = dplyr_row_5$v2

# STEP 4: create final file:

final_file = rbind(rows_1,rows_2, rows_3, rows_4, rows_5)

As a result, the final file looks something like this:

  idd group_1          v1 group_2 v2
1    1       B 15.72967370       1  B
2    2       B -4.33377705       2  B
3    3       A -0.55185019       3  B
4    4       B  2.66888123       4  B
5    5       B 12.10907264       5  B
6    6       A  0.01079273       6  B
7    7       A 20.77850323       7  B
8    8       A -1.98974383       8  B
9    9       B 12.16637035       9  B
10  10       A 11.43087030      10  B
11  11       A -0.65750091       1  A

My Question: Can someone please show me how to perform Steps 1 to Step 4 in a single "dplyr" command?

Thanks!

Upvotes: 0

Views: 289

Answers (2)

Cettt
Cettt

Reputation: 11981

Here is alternative method.

library(tidyverse)

df %>% 
  mutate(group_index = rep(1:(n() /10), each = 10)) %>% 
  group_by(group_index) %>%
  mutate(
   v2 = case_when(
    sum(v1[group_1 == 'A']) > sum(v1[group_1 == 'B']) ~ 'A', 
    sum(v1[group_1 == 'A']) < sum(v1[group_1 == 'B']) ~ 'B', 
    TRUE ~'0')
) 

Upvotes: 1

benson23
benson23

Reputation: 19107

  1. First I'll create a group_index to group every 10 rows together.
  2. Then group_by the relevant columns and calculate sum.
  3. Remove the grouping layer of group_1, since we need to compare the values in A and B.
  4. If the unique length of sum is equal to "1", that means they are the same, then input "0" in column v2. If they are not the same, output the maximum category stored in group_1.
  5. Finally remove the sum column and sort by idd.

This method is able to solve problem with more than two groups in group_1.

The first 20 rows are shown here for example.

library(tidyverse)

df %>% 
  mutate(group_index = rep(1:(nrow(df)/10), each = 10)) %>% 
  group_by(group_index, group_1) %>% 
  mutate(sum = sum(v1)) %>% 
  group_by(group_index) %>%
  mutate(v2 = ifelse(length(unique(sum)) == 1, 0, group_1[which.max(sum)])) %>%
  ungroup() %>% 
  select(-c(sum, group_index))

# A tibble: 20 x 5
     idd group_1      v1 group_2 v2   
   <int> <chr>     <dbl>   <int> <chr>
 1     1 B       15.7          1 B    
 2     2 B       -4.33         2 B    
 3     3 A       -0.552        3 B    
 4     4 B        2.67         4 B    
 5     5 B       12.1          5 B    
 6     6 A        0.0108       6 B    
 7     7 A       20.8          7 B    
 8     8 A       -1.99         8 B    
 9     9 B       12.2          9 B    
10    10 A       11.4         10 B    
11    11 A       -0.658        1 A    
12    12 B        5.71         2 A    
13    13 B        3.44         3 A    
14    14 B       19.6          4 A    
15    15 A       25.6          5 A    
16    16 A       -0.408        6 A    
17    17 A       19.3          7 A    
18    18 A        9.25         8 A    
19    19 B       -9.67         9 A    
20    20 B        2.44        10 A    

Upvotes: 1

Related Questions