stats_noob
stats_noob

Reputation: 5897

R: Randomly Replacing Elements of a Data Frame with 0

I am working with the R programming language.

Suppose I have the following data frame:

# Example data: five character columns that all hold the same
# comma-separated string.
var_1 <- var_2 <- var_3 <- var_4 <- var_5 <- "1,2,3,4,5,6,7,8,9,10"

my_data <- data.frame(var_1, var_2, var_3, var_4, var_5)

# Stack 100 copies of the first row underneath it (101 rows in total).
my_data <- rbind(my_data, my_data[rep(1, 100), ])

# Renumber the rows 1..n.
rownames(my_data) <- seq_len(nrow(my_data))

The data looks like this:

    head(my_data)

                 var_1                var_2                var_3                var_4                var_5
1 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10
2 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10
3 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10
4 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10
5 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10
6 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10

My Question: I would like to randomly replace elements in this data frame with 0 - for instance, the final result should look something like this (for brevity, I am only showing the first row):

# desired result

                 var_1                var_2                var_3                var_4                var_5
1 1,0,3,0,5,6,0,0,9,10 1,2,0,4,5,0,0,8,9,0 1,0,3,0,0,0,0,8,9,0 1,2,3,4,0,6,7,0,0,10 1,2,0,4,5,0,7,8,0,10

I tried to do this with the following line of code (Replace random values in a column in a dataframe) :

# For each column in turn, draw half as many row indices as there are rows
# (with replacement) and overwrite those whole cells with 0.
for (col_name in c("var_1", "var_2", "var_3", "var_4", "var_5")) {
  picked <- sample(
    nrow(my_data),
    as.integer(0.5 * nrow(my_data)),
    replace = TRUE
  )
  my_data[[col_name]][picked] <- 0
}

But this is replacing ALL the elements of a row with 0 (instead of just replacing some of the elements within a row):

head(my_data)
                 var_1                var_2                var_3                var_4                var_5
1 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10                    0                    0                    0
2                    0                    0 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10                    0
3                    0 1,2,3,4,5,6,7,8,9,10                    0 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10
4                    0 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10                    0 1,2,3,4,5,6,7,8,9,10
5 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10
6 1,2,3,4,5,6,7,8,9,10                    0 1,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,8,9,10                    0

Can someone please show me what I am doing wrong and how to get the desired result?

Thanks!

Upvotes: 2

Views: 379

Answers (3)

jay.sf
jay.sf

Reputation: 72593

Here's a version that uses Map so you can specify, separately for every column, a vector of probabilities pnul of a value becoming 0. The length of each split string is multiplied by the corresponding element of pnul to get the number of values that are set to zero. You may also set pnul to a scalar to use the same probability in all columns.

# Share of values to zero out, one entry per column of my_data
# (a scalar is recycled by Map() to every column).
pnul <- c(.0, .2, .5, .8, 1)

res <- Map(\(x, a) {
  # Split every cell of the column into its individual values.
  S <- strsplit(x, ",")
  # vapply() always returns a character vector, even for a zero-row column,
  # where sapply() would hand back an empty list.
  vapply(S, \(s) {
    # Overwrite length(s) * a randomly chosen positions with "0" ...
    s[sample(seq_along(s), length(s) * a)] <- "0"
    # ... and glue the values back into one comma-separated string.
    paste(s, collapse = ",")
  }, character(1))
}, my_data, pnul) |> as.data.frame()

head(res)
#                  var_1                var_2                var_3                var_4               var_5
# 1 1,2,3,4,5,6,7,8,9,10 0,0,3,4,5,6,7,8,9,10  1,2,0,4,0,0,7,8,0,0  0,0,0,0,0,0,0,8,9,0 0,0,0,0,0,0,0,0,0,0
# 2 1,2,3,4,5,6,7,8,9,10  1,0,3,4,5,6,7,8,9,0 1,0,3,0,5,0,0,0,9,10  0,0,0,0,0,0,7,8,0,0 0,0,0,0,0,0,0,0,0,0
# 3 1,2,3,4,5,6,7,8,9,10 1,0,0,4,5,6,7,8,9,10 1,0,0,0,0,6,7,0,9,10 0,0,0,0,5,0,0,0,0,10 0,0,0,0,0,0,0,0,0,0
# 4 1,2,3,4,5,6,7,8,9,10 1,2,3,0,5,6,7,0,9,10 0,0,3,0,5,0,7,0,9,10  0,0,0,4,0,0,7,0,0,0 0,0,0,0,0,0,0,0,0,0
# 5 1,2,3,4,5,6,7,8,9,10  1,0,3,4,5,6,7,8,9,0 0,2,0,4,5,0,7,0,0,10  1,0,0,0,0,0,0,8,0,0 0,0,0,0,0,0,0,0,0,0
# 6 1,2,3,4,5,6,7,8,9,10 0,2,3,4,5,6,0,8,9,10  1,2,3,0,5,0,7,0,0,0  0,0,0,4,5,0,0,0,0,0 0,0,0,0,0,0,0,0,0,0

Upvotes: 2

TarJae
TarJae

Reputation: 78917

This could be one way to get to your aim:

In your example, each cell of a column is treated as a single-element vector, so when you replace it, the whole string 1,...,10 is replaced by 0.

To avoid that, one possible way is to split each string into separate rows with separate_rows from the tidyr package and, after doing the replacement, collapse the values back into one string with toString:

library(dplyr)
library(tidyr)

# Zero out a random subset of the values inside every var_* cell.
my_data %>% 
  # Split each var_* cell into one row per delimited value.
  separate_rows(starts_with("var")) %>% 
  # Start a new group whenever var_1 equals 1. This relies on every original
  # string beginning with 1 and containing the value 1 only once.
  group_by(id_Group =cumsum(var_1==1)) %>% 
  # Within each group and column, draw ceiling(0.4 * n()) positions and set
  # them to 0. The draw is with replacement, so positions can repeat and fewer
  # distinct values may end up zeroed.
  mutate(across(starts_with("var"), ~replace(., sample(row_number(),  
                                        size = ceiling(0.4 * n()), replace = TRUE), 0))) %>% 
  # Collapse each group's values back into one ", "-separated string,
  # repeated on every row of the group.
  mutate(across(starts_with("var"), ~toString(.))) %>% 
  # All rows of a group now hold the same strings; keep the first one.
  slice(1) %>% 
  ungroup() %>% 
  # Drop the helper grouping column.
  select(-id_Group)
  var_1                         var_2                         var_3     var_4 var_5
   <chr>                         <chr>                         <chr>     <chr> <chr>
 1 1, 2, 0, 4, 0, 6, 7, 0, 9, 10 1, 2, 3, 0, 5, 0, 0, 8, 9, 0  0, 0, 3,~ 1, 2~ 0, 0~
 2 1, 2, 3, 4, 0, 6, 0, 8, 9, 0  1, 2, 3, 0, 0, 6, 7, 8, 9, 0  1, 2, 3,~ 1, 0~ 1, 0~
 3 1, 0, 3, 4, 5, 0, 7, 8, 9, 0  1, 2, 3, 0, 5, 0, 0, 0, 9, 10 1, 2, 0,~ 1, 2~ 1, 2~
 4 1, 0, 0, 4, 5, 0, 7, 8, 9, 0  1, 2, 3, 4, 5, 0, 7, 0, 0, 10 0, 0, 0,~ 0, 0~ 0, 0~
 5 0, 0, 3, 4, 0, 6, 7, 8, 9, 10 0, 0, 3, 4, 5, 6, 0, 8, 9, 10 1, 0, 3,~ 1, 2~ 1, 2~
 6 1, 0, 0, 4, 5, 6, 0, 8, 9, 10 1, 2, 3, 0, 0, 6, 7, 8, 9, 0  1, 0, 3,~ 1, 2~ 1, 2~
 7 1, 0, 3, 4, 5, 0, 0, 8, 0, 10 0, 2, 3, 0, 5, 6, 7, 8, 0, 0  1, 2, 3,~ 1, 2~ 1, 0~
 8 1, 2, 0, 4, 0, 0, 7, 8, 9, 10 1, 2, 3, 0, 5, 0, 7, 8, 0, 0  1, 0, 3,~ 1, 0~ 0, 2~
 9 1, 2, 3, 0, 5, 0, 0, 0, 9, 10 1, 2, 3, 4, 0, 0, 0, 8, 9, 0  1, 2, 0,~ 0, 0~ 0, 0~
10 1, 2, 3, 0, 5, 6, 7, 8, 9, 0  1, 2, 0, 4, 5, 6, 7, 8, 0, 10 1, 0, 3,~ 1, 2~ 1, 2~
# ... with 91 more rows

Upvotes: 2

Rui Barradas
Rui Barradas

Reputation: 76402

Here is a way.
Create a function that samples n values (defaulting to 1) within each element of a vector and replaces them with 0, then lapply the function to the vectors to be changed. In the posted example all column vectors will be changed.

# Use a wide print width (205 characters) for the output shown below.
options(width = 205)

# Example data: five character columns holding the same comma-separated
# string, with the first row repeated to give 101 rows.
var_1 <- var_2 <- var_3 <- var_4 <- var_5 <- "1,2,3,4,5,6,7,8,9,10"
my_data <- data.frame(var_1, var_2, var_3, var_4, var_5)
my_data <- rbind(my_data, my_data[rep(1, 100), ])
rownames(my_data) <- seq_len(nrow(my_data))

# Keep a second, unmodified copy of the data.
my_data1 <- my_data

# Randomly replace `n` of the delimited values in each string of `x` with "0".
#
# Arguments:
#   x     Character vector; each element is a delimited list such as "1,2,3".
#   n     Number of values to zero in each element (default 1L). It is capped
#         at the number of values the element actually contains.
#   split Delimiter. It is passed unchanged to strsplit() to split the values
#         and to paste() to join them back together.
#
# Returns a character vector of the same length as `x`. Zero-length input
# gives character(0), and missing elements of `x` stay NA.
randReplace <- function(x, n = 1L, split = ",") {
  tokens <- strsplit(x, split = split)
  # vapply() pins the result type: an empty input yields character(0),
  # whereas unlist() on an empty list would yield NULL.
  out <- vapply(tokens, function(parts) {
    i <- seq_along(parts)
    # Never ask sample() for more positions than there are values.
    m <- min(n, length(parts))
    parts[sample(i, m)] <- "0"
    paste(parts, collapse = split)
  }, character(1))
  # A missing input has no values to zero; keep it missing instead of
  # turning it into "0" (or the string "NA").
  out[is.na(x)] <- NA_character_
  out
}

# Zero one randomly chosen value per cell (randReplace's default n).
# Assigning into my_data[] keeps the result a data frame.
my_data[] <- lapply(my_data, function(column) randReplace(column))
head(my_data)
#>                  var_1                var_2                var_3                var_4                var_5
#> 1 1,2,3,4,5,0,7,8,9,10 0,2,3,4,5,6,7,8,9,10 1,2,3,4,5,0,7,8,9,10 1,2,3,4,0,6,7,8,9,10 1,2,3,0,5,6,7,8,9,10
#> 2 1,0,3,4,5,6,7,8,9,10 1,2,0,4,5,6,7,8,9,10 1,2,3,0,5,6,7,8,9,10 1,2,3,4,5,0,7,8,9,10 0,2,3,4,5,6,7,8,9,10
#> 3 1,2,3,4,0,6,7,8,9,10 1,2,3,4,5,6,7,0,9,10 0,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,0,8,9,10 1,2,3,4,0,6,7,8,9,10
#> 4 1,2,0,4,5,6,7,8,9,10 1,2,3,4,0,6,7,8,9,10 1,2,3,4,5,6,7,0,9,10 1,2,3,4,5,6,0,8,9,10 1,0,3,4,5,6,7,8,9,10
#> 5 1,2,3,0,5,6,7,8,9,10 1,2,3,4,5,6,7,8,0,10 1,2,3,0,5,6,7,8,9,10 0,2,3,4,5,6,7,8,9,10 1,2,3,4,5,6,7,0,9,10
#> 6 1,2,3,4,5,6,0,8,9,10  1,2,3,4,5,6,7,8,9,0 1,2,3,0,5,6,7,8,9,10  1,2,3,4,5,6,7,8,9,0 1,0,3,4,5,6,7,8,9,10

# Same call on my_data1, this time zeroing four values per cell.
my_data1[] <- lapply(my_data1, function(column) randReplace(column, n = 4))
head(my_data1)
#>                  var_1                var_2                var_3                var_4                var_5
#> 1  1,2,3,4,0,0,0,8,9,0  0,2,0,0,5,6,7,8,9,0 1,0,0,4,0,6,0,8,9,10 0,0,3,4,0,0,7,8,9,10  1,0,3,4,5,6,0,8,0,0
#> 2  1,0,3,4,5,6,0,8,0,0  1,0,3,0,0,6,7,8,9,0 0,2,3,4,5,0,0,8,0,10 1,0,0,4,5,0,7,8,0,10  0,0,3,4,0,6,7,8,9,0
#> 3  0,2,3,4,5,6,7,0,0,0 0,2,0,0,5,6,7,8,0,10  1,2,0,4,5,6,0,0,9,0 0,2,0,0,5,0,7,8,9,10  0,2,0,4,5,0,7,8,9,0
#> 4 1,2,3,0,0,6,0,8,0,10  1,0,3,4,5,0,7,8,0,0 1,2,0,0,5,0,7,0,9,10  1,0,3,0,5,6,0,8,9,0 0,2,0,4,0,6,0,8,9,10
#> 5  1,2,3,4,0,0,7,0,9,0 0,0,0,0,5,6,7,8,9,10 0,0,3,4,5,6,0,0,9,10 0,2,3,4,0,0,7,0,9,10 1,0,0,0,5,6,7,8,0,10
#> 6 1,2,0,4,0,0,0,8,9,10 0,2,3,0,0,0,7,8,9,10 0,2,0,4,0,6,7,8,0,10 1,2,3,4,5,0,0,0,0,10  0,2,0,4,0,6,7,8,9,0

Created on 2022-04-10 by the reprex package (v2.0.1)

Upvotes: 2

Related Questions