Reputation: 350
I have values dataframe consisting of character columns and a properties dataframe consisting of the column names of the values dataframe and a predefined data type. I need to check if the values data types are consistent with the predefined data type. Therefore I want to change the data types in the values dataframe based on the predefined data types in the properties dataframe in an automated way using map(), apply() or a for loop, preferably a tidy(verse) solution.
Below is an example but the actual dataset has a lot more columns/properties.
# Libraries
library(dplyr)
# Seed
set.seed(12345)
# Property example
properties <-
tibble(property = c("a", "b", "c"),
datatype = c("logical", "integer", "double"))
# Values example
values <-
tibble(a = sample(c("TRUE", "FALSE", "CORRECT"), 10, replace = TRUE),
b = as.character(sample(c(1:9, 1.4), 10)),
c = sample(c(-1.5:1.5, "word"), 10, replace = TRUE))
properties
## A tibble: 3 x 2
# property datatype
# <chr> <chr>
#1 a logical
#2 b integer
#3 c double
values
## A tibble: 10 x 3
# a b c
# <chr> <chr> <chr>
# 1 FALSE 7 -1.5
# 2 CORRECT 1 word
# 3 FALSE 8 1.5
# 4 FALSE 1.4 -0.5
# 5 TRUE 6 -0.5
# 6 CORRECT 9 1.5
# 7 FALSE 4 0.5
# 8 FALSE 2 1.5
# 9 CORRECT 3 -1.5
#10 FALSE 5 -1.5
Part the my check will be to compare the amount of NAs per column before and after the transformation. This creates another problem as the double values will be automatically transformed to integers, while this is should become NA because it is a faulty data type (column b, row 4).
# Prefered result
tibble(a = c(FALSE, NA, FALSE, FALSE, TRUE, NA, FALSE, FALSE, NA, FALSE),
b = c(7L, 1L, 8L, NA_integer_, 6L, 9L, 4L, 2L, 3L, 5L),
c = c(-1.5, NA, 1.5, -0.5, -0.5, 1.5, 0.5, 1.5, -1.5, -1.5))
## A tibble: 10 x 3
# a b c
# <lgl> <int> <dbl>
# 1 FALSE 7 -1.5
# 2 NA 1 NA
# 3 FALSE 8 1.5
# 4 FALSE NA -0.5
# 5 TRUE 6 -0.5
# 6 NA 9 1.5
# 7 FALSE 4 0.5
# 8 FALSE 2 1.5
# 9 NA 3 -1.5
#10 FALSE 5 -1.5
All help is much appreciated!
Upvotes: 1
Views: 278
Reputation: 1688
EDIT:
Add a dplyr
solution
properties <- mutate(properties,func_name =paste0("as.",datatype) )
values %>%
mutate_all(function(x) match.fun(properties$func_name[match(deparse(substitute(x)),properties$property)])(x))
#> Warning in (function (x) : NAs introduced by coercion
#> # A tibble: 10 x 3
#> a b c
#> <lgl> <int> <dbl>
#> 1 FALSE 7 -1.5
#> 2 NA 1 NA
#> 3 FALSE 8 1.5
#> 4 FALSE 1 -0.5
#> 5 TRUE 6 -0.5
#> 6 NA 9 1.5
#> 7 FALSE 4 0.5
#> 8 FALSE 2 1.5
#> 9 NA 3 -1.5
#> 10 FALSE 5 -1.5
Basically this can be done using match.fun(funcname)
# Libraries
library(dplyr)
library(data.table)
# Seed
set.seed(12345)
# Property example
properties <-
tibble(property = c("a", "b", "c"),
datatype = c("logical", "integer", "double"))
# datatype = c("boolean", "integer", "double"))
# Values example
values <-
tibble(a = sample(c("TRUE", "FALSE", "CORRECT"), 10, replace = TRUE),
b = as.character(sample(c(1:9, 1.4), 10)),
c = sample(c(-1.5:1.5, "word"), 10, replace = TRUE))
setDT(properties)
setDT(values)
properties[,func_name:=paste0("as.",datatype)]
for (i in names(values)){
set(values,j = i,value=match.fun(properties[property==i,func_name])(values[[i]]))
print(match.fun(properties[property==i,func_name]))
}
#> function (x, ...) .Primitive("as.logical")
#> function (x, ...) .Primitive("as.integer")
#> Warning in match.fun(properties[property == i, func_name])(values[[i]]): NAs
#> introduced by coercion
#> function (x, ...) .Primitive("as.double")
values
#> a b c
#> 1: FALSE 7 -1.5
#> 2: NA 1 NA
#> 3: FALSE 8 1.5
#> 4: FALSE 1 -0.5
#> 5: TRUE 6 -0.5
#> 6: NA 9 1.5
#> 7: FALSE 4 0.5
#> 8: FALSE 2 1.5
#> 9: NA 3 -1.5
#> 10: FALSE 5 -1.5
Created on 2020-04-20 by the reprex package (v0.3.0)
I changed your "boolean" to "logical" as there is no function called as.boolean
You can also try as.boolean <- as.logical
to avoid modify your exsiting codes.
Upvotes: 1
Reputation: 11
you can try the following code and see if it serves your purpose.
# Libraries
library(dplyr)
# Seed
set.seed(12345)
# Property example
properties <-
tibble(property = c("a", "b", "c"),
datatype = c("logical", "integer", "double"))
# Values example
values <-
tibble( a = as.logical(sample(c("TRUE", "FALSE", "CORRECT"), 10, replace = TRUE)),
b = sample(c(1:9, 1.4), 10) %>% ifelse( . -floor(.) == 0 , . ,NA ),
c = as.numeric(sample(c(-1.5:1.5, "word"), 10, replace = TRUE)) )
Warning message:
In eval_tidy(xs[[i]], unique_output) : NAs introduced by coercion
values
# A tibble: 10 x 3
a b c
<lgl> <dbl> <dbl>
1 FALSE 6 -1.5
2 TRUE NA -0.5
3 TRUE 3 1.5
4 FALSE 5 0.5
5 TRUE 2 -1.5
6 NA 8 0.5
7 TRUE 7 0.5
8 TRUE 9 1.5
9 NA 1 -1.5
10 NA 4 NA
You can check if it works for you, Cheers!
Upvotes: 1