Reputation: 297

Replacing NULL value in Dataframe in R with Median of Column

I have a dataframe with multiple NULL values. The class type of the columns are LIST not NUMERIC. Is it possible to replace all the NULL values with the median value of the column? I tried a manual way was to change the NULL value of the column 1 by 1, using the as.numeric() function and subsequently apply the median() function. Is there a more efficient way to do this?

i1 <- sapply(pivot_table_1$`Start Working`, is.null)
pivot_table_1$`Start Working`[i1] <- 0

Output from dput():

structure(list(Day = 1:31, `Start Sleeping` = list(0, 20, 35, 
    40, 50, 0, 40, 0, 0, 40, 50, 0, 0, 40, 0, 40, 35, 45, 0, 
    0, 65, 35, 40, 40, 0, 50, 40, 0, 0, 0, 0), `Stop Sleeping` = list(
    440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 
    440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 
    440, 440, 440, 440, 440, 440, 440), `Start Working` = list(
    490, 490, 490, 490, 0, 0, 490, 490, 490, 490, 490, 0, 0, 
    490, 490, 490, 490, 490, 0, 0, 490, 490, 490, 490, 490, 0, 
    0, 490, 490, 490, 490), `Stop Working` = list(1005, 1005, 
    1005, 1005, NULL, NULL, 965, 965, 965, 965, 965, NULL, NULL, 
    965, 965, 965, 965, 965, NULL, NULL, 965, 965, 965, 965, 
    965, NULL, NULL, 965, 965, 965, 965), Breakfast = list(690, 
    645, 615, 540, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
    475, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 475, NULL, 
    NULL, NULL, NULL, NULL, 475, NULL, NULL, NULL, NULL, NULL), 
    Dinner = list(1390, 1360, 1285, 1270, 1390, NULL, 1140, 1140, 
        1130, 1135, 1130, NULL, 1165, 1140, 1130, 1135, 1130, 
        1140, 1140, 1180, NULL, 1145, 1135, 1140, 1135, 1160, 
        1140, 1140, NULL, 1140, NULL)), row.names = c(NA, -31L
), class = c("tbl_df", "tbl", "data.frame"))

Upvotes: 4

Answers (3)

PaulS

Reputation: 25323

Another possible solution:

library(tidyverse)

df %>% 
  mutate(across(-Day, ~ ifelse(lengths(.x) == 0, median(unlist(.x)), .x))) 

#> # A tibble: 31 × 7
#>      Day `Start Sleeping` `Stop Sleeping` `Start Working` `Stop Working`
#>    <int> <list>           <list>          <list>          <list>        
#>  1     1 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  2     2 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  3     3 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  4     4 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  5     5 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  6     6 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  7     7 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  8     8 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  9     9 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#> 10    10 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#> # … with 21 more rows, and 2 more variables: Breakfast <list>, Dinner <list>

Unnesting:

library(tidyverse)

df %>% 
  mutate(across(-Day, ~ ifelse(lengths(.x) == 0, median(unlist(.x)), .x))) %>% 
  unnest(everything())

#> # A tibble: 31 × 7
#>      Day `Start Sleeping` `Stop Sleeping` `Start Working` `Stop Working`
#>    <int>            <dbl>           <dbl>           <dbl>          <dbl>
#>  1     1                0             440             490           1005
#>  2     2               20             440             490           1005
#>  3     3               35             440             490           1005
#>  4     4               40             440             490           1005
#>  5     5               50             440               0            965
#>  6     6                0             440               0            965
#>  7     7               40             440             490            965
#>  8     8                0             440             490            965
#>  9     9                0             440             490            965
#> 10    10               40             440             490            965
#> # … with 21 more rows, and 2 more variables: Breakfast <dbl>, Dinner <dbl>

Upvotes: 0

Darren Tsai

Reputation: 35554

replace_na() from tidyr can be used to replace NULLs in a list. (NULLs are the list-column equivalent of NAs)

library(tidyverse)

replace_na(df, map(keep(df, is.list), ~ list(median(unlist(.x)))))

# # A tibble: 31 × 7
#      Day `Start Sleeping` `Stop Sleeping` `Start Working` `Stop Working` Breakfast Dinner   
#    <int> <list>           <list>          <list>          <list>         <list>    <list>   
#  1     1 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
#  2     2 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
#  3     3 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
#  4     4 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
#  5     5 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
#  6     6 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
#  7     7 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
#  8     8 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
#  9     9 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
# 10    10 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>      <dbl [1]> <dbl [1]>
# # … with 21 more rows

If you want these list-columns to be flattened, try unnest():

replace_na(df, map(keep(df, is.list), ~ list(median(unlist(.x))))) %>%
  unnest(where(is.list))

# # A tibble: 31 × 7
#      Day `Start Sleeping` `Stop Sleeping` `Start Working` `Stop Working` Breakfast Dinner
#    <int>            <dbl>           <dbl>           <dbl>          <dbl>     <dbl>  <dbl>
#  1     1                0             440             490           1005       690   1390
#  2     2               20             440             490           1005       645   1360
#  3     3               35             440             490           1005       615   1285
#  4     4               40             440             490           1005       540   1270
#  5     5               50             440               0            965       540   1390
#  6     6                0             440               0            965       540   1140
#  7     7               40             440             490            965       540   1140
#  8     8                0             440             490            965       540   1140
#  9     9                0             440             490            965       540   1130
# 10    10               40             440             490            965       540   1135

Upvotes: 1

Allan Cameron

Reputation: 173793

If you wish to keep the entries as length-one lists you can do:

pivot_table_1[] <- lapply(pivot_table_1, function(x) {
    ifelse(lengths(x) == 1, x, list(median(unlist(x))))})

pivot_table_1
#> # A tibble: 31 x 7
#>      Day `Start Sleeping` `Stop Sleeping` `Start Working` `Stop Working`
#>    <int> <list>           <list>          <list>          <list>        
#>  1     1 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  2     2 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  3     3 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  4     4 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  5     5 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  6     6 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  7     7 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  8     8 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#>  9     9 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#> 10    10 <dbl [1]>        <dbl [1]>       <dbl [1]>       <dbl [1]>     
#> # ... with 21 more rows, and 2 more variables: Breakfast <list>, Dinner <list>

Or, if you want them as numeric columns, do:

pivot_table_1[] <- lapply(pivot_table_1, function(x) {
    unlist(ifelse(lengths(x) == 1, x, list(median(unlist(x)))))})

pivot_table_1
#> # A tibble: 31 x 7
#>      Day `Start Sleeping` `Stop Sleeping` `Start Working` `Stop Working`
#>    <int>            <dbl>           <dbl>           <dbl>          <dbl>
#>  1     1                0             440             490           1005
#>  2     2               20             440             490           1005
#>  3     3               35             440             490           1005
#>  4     4               40             440             490           1005
#>  5     5               50             440               0            965
#>  6     6                0             440               0            965
#>  7     7               40             440             490            965
#>  8     8                0             440             490            965
#>  9     9                0             440             490            965
#> 10    10               40             440             490            965
#> # ... with 21 more rows, and 2 more variables: Breakfast <dbl>, Dinner <dbl>

^{Created on 2022-05-22 by the reprex package (v2.0.1)}

Upvotes: 2

Replacing NULL value in Dataframe in R with Median of Column

Answers (3)

Related Questions