Ajrhjnd
Ajrhjnd

Reputation: 330

Adding dummy rows to your data in R

I am working with R. I have a data set that looks like this.

# Example data: 12 rows, 8 in category "1" and 4 in category "2".
# The object is bound to `df` so it can be referred to by name once built.
# The `groups` attribute lists one category/Cue pair per row, each pointing
# at a single row index in `.rows` (class "grouped_df" suggests this is
# dplyr grouping metadata).
df <- structure(
  list(
    category = c("1", "1", "1", "1", "1", "1", "1", "1",
                 "2", "2", "2", "2"),
    Cue = c("Kind", "love", "acount", "bright", "smart", "land",
            "flag", "affect", "place", "street", "sun", "cold"),
    categoryProduced = c("1", "1", "1", "1", "1", "1",
                         "1", "1", "1", "1", "1", "1"),
    mean = c(3, 1, 1, 1, 2, 4, 4,
             4, 4, 2, 4, 2)
  ),
  row.names = c(NA, -12L),
  groups = structure(
    list(
      category = c("1", "1", "1", "1", "1", "1", "1", "1",
                   "2", "2", "2", "2"),
      Cue = c("Kind", "love", "acount", "bright", "smart", "land",
              "flag", "affect", "place", "street", "sun", "cold"),
      # One single-row group per category/Cue combination.
      .rows = structure(
        list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
             9L, 10L, 11L, 12L),
        ptype = integer(0),
        class = c("vctrs_list_of", "vctrs_vctr", "list")
      )
    ),
    row.names = c(NA, 12L),
    class = c("tbl_df", "tbl", "data.frame"),
    .drop = TRUE
  ),
  class = c("grouped_df", "tbl_df", "tbl", "data.frame")
)

I need to reach 20 observations for each category. Right now, I have 8 observations in the first category and 4 in the second category. So the first one needs 12 more rows and the second one needs 16 more.

I need the resulting data to look like this..

category    Cue    categoryProduced  mean
1 1         Kind      1                    3
2 1        love       1                    1
3 1        acount     1                    1
4 1        bright     1                    1
5 1        smart      1                    2
6 1        land       1                    4
7 1        flag       1                    4
8 1        affect     1                    4
9 1          NA       0                    0
10 1         NA       0                    0
11 1         NA       0                    0 
12 2        place     1                    4
13 2        street    1                    2
14 2        sun       1                    4
15 2        cold      1                    2
16 2          NA      0                    0 
17 2          NA      0                    0

And so on, until I have 20 observations for each category. What is written under Cue could be anything. What I do need is a 0 under categoryProduced and mean.

Any help with this would be great. Thanks.

Upvotes: 1

Views: 41

Answers (2)

user2974951
user2974951

Reputation: 10375

Group by category, if nrow<20 fill with a computed data frame with default values, else return the data as is.

# Pad every category of `df` with filler rows until it holds 20 rows, then
# stack the per-category pieces back into one data frame.
do.call(
  rbind,
  by(df, list(df$category), function(grp) {
    n_missing <- 20 - nrow(grp)
    # A category that already has 20 or more rows is passed through as is.
    if (n_missing <= 0) {
      return(grp)
    }
    # Filler rows: the group's category, NA in the second column, and 0 in
    # every remaining column.
    filler <- data.frame(
      grp$category[1],
      NA,
      matrix(0, n_missing, ncol(grp) - 2)
    )
    rbind(grp, setNames(filler, colnames(grp)))
  })
)

This produces:

      category    Cue categoryProduced mean
1.1          1   Kind                1    3
1.2          1   love                1    1
1.3          1 acount                1    1
1.4          1 bright                1    1
1.5          1  smart                1    2
1.6          1   land                1    4
1.7          1   flag                1    4
1.8          1 affect                1    4
1.9          1   <NA>                0    0
1.10         1   <NA>                0    0
1.11         1   <NA>                0    0
1.12         1   <NA>                0    0
1.13         1   <NA>                0    0
1.14         1   <NA>                0    0
1.15         1   <NA>                0    0
1.16         1   <NA>                0    0
1.17         1   <NA>                0    0
1.18         1   <NA>                0    0
1.19         1   <NA>                0    0
1.20         1   <NA>                0    0
2.9          2  place                1    4
2.10         2 street                1    2
2.11         2    sun                1    4
2.12         2   cold                1    2
2.1          2   <NA>                0    0
2.2          2   <NA>                0    0
2.3          2   <NA>                0    0
2.4          2   <NA>                0    0
2.5          2   <NA>                0    0
2.6          2   <NA>                0    0
2.7          2   <NA>                0    0
2.8          2   <NA>                0    0
2.91         2   <NA>                0    0
2.101        2   <NA>                0    0
2.111        2   <NA>                0    0
2.121        2   <NA>                0    0
2.13         2   <NA>                0    0
2.14         2   <NA>                0    0
2.15         2   <NA>                0    0
2.16         2   <NA>                0    0

Upvotes: 1

Ronak Shah
Ronak Shah

Reputation: 389135

With the help of tidyr::complete you can do

library(dplyr)
library(tidyr)

# Pad each category to 20 rows: number the existing rows within the category,
# then let complete() create the row numbers that are missing up to 20.
df %>%
  group_by(category) %>%
  mutate(count = row_number()) %>%
  # categoryProduced is a character column, so its fill value is the string
  # "0": tidyr 1.2.0 and later cast fill values to the column type and reject
  # a numeric 0 here. Cue is not listed in `fill`, so it stays NA in new rows.
  complete(count = 1:20, fill = list(categoryProduced = "0", mean = 0)) %>%
  ungroup()

# A tibble: 40 x 5
#   category count Cue    categoryProduced  mean
#   <chr>    <int> <chr>  <chr>            <dbl>
# 1 1            1 Kind   1                    3
# 2 1            2 love   1                    1
# 3 1            3 acount 1                    1
# 4 1            4 bright 1                    1
# 5 1            5 smart  1                    2
# 6 1            6 land   1                    4
# 7 1            7 flag   1                    4
# 8 1            8 affect 1                    4
# 9 1            9 NA     0                    0
#10 1           10 NA     0                    0
# … with 30 more rows

The count column holds the row number within each category, from 1 to 20. You can remove it with select(-count) if it is not needed in the final output.

Upvotes: 4

Related Questions