Reputation: 259

Replace each row based on the string match and column value

I have a dataset like this

dt <- data.table(Score = c(0.33,0.34,00.3, -0.22, 0.232), 
                 Id2 = c("0/0","0/1","1/0","0/0","0/0"), 
                 Kps = c("0/1","0/0","1/1","0/1","0/0"), 
                 Inr = c("0/0","0/1","1/1","0/0","0/1"))

I need to replace the values of each row based on the Score column as like this

If "0/0" or "1/1" then Score * 2
If "1/0" or "0/1" then Score

Usually, it can be done by using the base function like this

dt$Id2 <- dt$Score * 2

But here I have to consider each row and I have around 1000 columns so it can be only done with loop

The expected output

Score  Id2    Kps    Inr 
0.330  0.66   0.330  0.66
0.340  0.340  0.68  0.340
0.300  0.300  0.6   0.6
-0.220 -0.44 -0.22 -0.44
0.232  0.464 0.464  0.232

Any suggestions?

Upvotes: 4

Answers (4)

zx8754

Reputation: 56259

Using matrix multiplication:

# like @akrun using a named vector for conversion, to avoid ifelse/case/switch:
keyval <- setNames(c(2, 2, 1, 1), c("0/0", "1/1", "1/0", "0/1"))

#convert and make the matrix, then multiply
matrix(keyval[ as.matrix(dt[, -1 ]) ] * dt[[ 1 ]], ncol = ncol(dt) - 1)
#        [,1]   [,2]   [,3]
# [1,]  0.660  0.330  0.660
# [2,]  0.340  0.680  0.340
# [3,]  0.300  0.600  0.600
# [4,] -0.440 -0.220 -0.440
# [5,]  0.464  0.464  0.232

Benchmark using a bigger dataset:

library(dplyr)
library(tidyr)

#bigger data
n = 1000
set.seed(1); dt <- data.table(cbind(
  Score = runif(n),
  data.frame(matrix(sample(c("0/0", "0/1", "1/0", "1/1"), n * n, replace = TRUE), ncol = n))))

Matrix multiplication should give 3-7x improvement compared to dplyr:

m <- microbenchmark::microbenchmark(
  matrix = {
    matrix(keyval[ as.matrix(dt[, -1 ]) ] * dt[[ 1 ]], ncol = ncol(dt) - 1)
  },
  dplyr1 = {
    dt |> 
      pivot_longer(-Score) |> 
      mutate(value = case_when(
        value == '0/0' | value == "1/1" ~ Score *2,
        value == '1/0' | value == "0/1" ~ Score 
      )) |> 
      pivot_wider(names_from = name, values_from = value)
  },
  dplyr2 = {
    dt %>%
      mutate(across(-Score, ~ Score * (.x %in% c("0/0", "1/1") + 1)))
  })

print(m, unit = "relative")
# Unit: relative
#    expr      min       lq     mean   median       uq      max neval
#  matrix 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
#  dplyr1 7.697692 8.468686 7.279598 8.071069 7.652855 3.423847   100
#  dplyr2 3.862794 3.708899 3.399736 3.560082 3.687698 2.096620   100

Upvotes: 0

akrun

Reputation: 887951

As the input is data.table, here is one approach with data.table

library(data.table)
 dt[, (names(dt)[-1]) := lapply(.SD, \(x)
    fcase(x %chin% c("0/0", "1/1"), Score *2,
    x %chin% c("1/0", "0/1"), Score)), .SDcols = -1]

-output

> dt
    Score    Id2    Kps    Inr
1:  0.330  0.660  0.330  0.660
2:  0.340  0.340  0.680  0.340
3:  0.300  0.300  0.600  0.600
4: -0.220 -0.440 -0.220 -0.440
5:  0.232  0.464  0.464  0.232

Or another option is to make use of named vector

keyval <- setNames(c(2, 2, 1, 1), c("0/0", "1/1", "1/0", "0/1"))
dt[, (names(dt)[-1]) := lapply(.SD, \(x) Score *keyval[x]), .SDcols = -1]

-output

> dt
    Score    Id2    Kps    Inr
1:  0.330  0.660  0.330  0.660
2:  0.340  0.340  0.680  0.340
3:  0.300  0.300  0.600  0.600
4: -0.220 -0.440 -0.220 -0.440
5:  0.232  0.464  0.464  0.232

Or create a count of 1s and 0s to multiply

library(stringr)
dt[, (names(dt)[-1]) := lapply(.SD, \(x) Score * 1 + 
   (str_count(x, "0")!= 1)) , .SDcols = -1]
> dt
    Score   Id2    Kps   Inr
1:  0.330 1.330  0.330 1.330
2:  0.340 0.340  1.340 0.340
3:  0.300 0.300  1.300 1.300
4: -0.220 0.780 -0.220 0.780
5:  0.232 1.232  1.232 0.232

Upvotes: 3

Darren Tsai

Reputation: 35649

With dplyr::across(), you can apply a function across multiple columns. It supports tidy selections so that you can cleverly select variables based on their names or properties.

library(dplyr)

dt %>%
  mutate(across(-Score, ~ ifelse(.x %in% c("0/0", "1/1"), Score * 2, Score)))

#     Score    Id2    Kps    Inr
# 1:  0.330  0.660  0.330  0.660
# 2:  0.340  0.340  0.680  0.340
# 3:  0.300  0.300  0.600  0.600
# 4: -0.220 -0.440 -0.220 -0.440
# 5:  0.232  0.464  0.464  0.232

A tricky way

dt %>%
  mutate(across(-Score, ~ Score * (.x %in% c("0/0", "1/1") + 1)))

Upvotes: 2

MarBlo

Reputation: 4534

Here is a tidyverse -way solution. It uses a data.frame and makes it longer in a first step. Then with case_when the different conditions were implemented.

pivot_wider brought it back to a wider format.

library(tidyverse)

dt<- data.frame(Score = c(0.33,0.34,00.3, -0.22, 0.232), 
                Id2=c("0/0","0/1","1/0","0/0","0/0"), 
                Kps=c("0/1","0/0","1/1","0/1","0/0"), 
                Inr=c("0/0","0/1","1/1","0/0","0/1"))


dt |> 
  pivot_longer(-Score) |> 
  mutate(value = case_when(
    value == '0/0' | value == "1/1" ~ Score *2,
    value == '1/0' | value == "0/1" ~ Score 
  )) |> 
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 5 × 4
#>    Score    Id2    Kps    Inr
#>    <dbl>  <dbl>  <dbl>  <dbl>
#> 1  0.33   0.66   0.33   0.66 
#> 2  0.34   0.34   0.68   0.34 
#> 3  0.3    0.3    0.6    0.6  
#> 4 -0.22  -0.44  -0.22  -0.44 
#> 5  0.232  0.464  0.464  0.232

Upvotes: 2

Replace each row based on the string match and column value

Answers (4)

Related Questions