user3789200
user3789200

Reputation: 1186

All the column NA values in a dataframe fill with median values in R

I need to fill the null values of all the numerical columns with each column's median value in a data frame. I did the following code.

median_forNumericalNulls <- function(dataframe){
  
  nums <- unlist(lapply(dataframe, is.numeric))  
  
  df_num <- dataframe[ , nums]
  
  df_num[] <- lapply(df_num, function(x) { 
    x[is.na(x)] <- median(x, na.rm = TRUE)
    x
  })      

  return(dataframe)
  
}

median_forNumericalNulls(A)

A is the parent table, which consists of both numerical as well as categorical variables. How can I replace the columns of 'A' dataframe with the output of the function median_forNumericalNulls?

Is there a better way that we can do the same?

Upvotes: 2

Views: 586

Answers (3)

hello_friend
hello_friend

Reputation: 5788

Base R solution:

# Function to deteremine data.frame vector indices that are numeric:
# resolve_num_cols => function() 
resolve_num_cols <- function(df){
  # Store a vector of numeric column names: 
  # num_cols => logical vector
  num_cols <- which(
    vapply(
      df, 
      is.numeric, 
      logical(1),
      USE.NAMES = FALSE
    )
  )
  # Explicitly define the returned object: logical vector => env
  return(num_cols)
}

# Function to impute median values for each numeric vector in data.frame
# impute_median_vals_in_df => function()
impute_median_vals_in_df <- function(df, num_col_idx){
  # Replace the na. values in each numeric vector: df => data.frame 
  df[,num_col_idx] <- lapply(
    num_col_idx, 
    function(col_idx){
      df[,col_idx] <- ifelse(
        is.na(df[,col_idx]),
        median(df[,col_idx], na.rm = TRUE),
        df[,col_idx]
      )
    }
  )
  # Return the data.frame object: data.frame => env
  return(df)
}

# Apply the function to resolve the numeric vectors in data.frame: 
# num_cols => integer vector
num_cols <- resolve_num_cols(df1)

# Apply the function: clean_df => data.frame
clean_df <- impute_median_vals_in_df(df1, num_cols)

Data used:

# Import data: df1 => data.frame
df1 <- structure(list(mpg = c(NA, 21, NA, 21.4, 18.7, 18.1, 14.3, 24.4, 
22.8, 19.2, NA, 16.4, 17.3, NA, NA, 10.4, 14.7, 32.4, 30.4, 33.9, 
21.5, NA, 15.2, 13.3, 19.2, 27.3, 26, NA, NA, 19.7, 15, 21.4), 
cyl = c(NA, 6, NA, 6, 8, NA, NA, 4, 4, 6, 6, 8, 8, 8, 8, 
8, 8, NA, NA, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, NA, NA, 4), disp = c(160, 
160, 108, 258, 360, 225, 360, 146.7, 140.8, 167.6, 167.6, 
NA, NA, 275.8, NA, 460, 440, 78.7, 75.7, 71.1, NA, 318, NA, 
350, 400, NA, NA, 95.1, 351, NA, 301, 121), hp = c(110, 110, 
93, 110, NA, 105, 245, 62, 95, 123, 123, 180, NA, 180, 205, 
215, NA, 66, NA, 65, 97, NA, NA, 245, 175, 66, 91, NA, 264, 
175, NA, 109), drat = c(3.9, 3.9, 3.85, 3.08, 3.15, 2.76, 
3.21, NA, 3.92, 3.92, 3.92, 3.07, NA, 3.07, 2.93, 3, 3.23, 
4.08, NA, 4.22, NA, 2.76, 3.15, 3.73, 3.08, NA, NA, 3.77, 
4.22, 3.62, NA, NA), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 
3.46, 3.57, NA, 3.15, 3.44, 3.44, 4.07, NA, NA, 5.25, 5.424, 
5.345, 2.2, 1.615, 1.835, NA, NA, 3.435, 3.84, NA, NA, NA, 
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61, 
19.44, NA, NA, NA, 20, NA, 18.3, 18.9, 17.4, 17.6, 18, NA, 
17.82, NA, 19.47, 18.52, 19.9, NA, 16.87, NA, 15.41, 17.05, 
18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6), vs = c(0, NA, 
1, 1, NA, NA, 0, NA, 1, NA, 1, 0, 0, 0, 0, 0, 0, 1, NA, 1, 
1, 0, 0, 0, NA, 1, NA, 1, 0, 0, 0, 1), am = c(NA, NA, NA, 
0, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, NA, 
0, 0, 0, NA, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3, NA, 
3, 3, NA, NA, 4, 4, 3, 3, 3, 3, 3, NA, 4, 4, 4, 3, 3, 3, 
3, 3, 4, 5, NA, NA, NA, NA, 4), carb = c(4, 4, 1, 1, 2, 1, 
4, NA, NA, 4, NA, 3, NA, 3, NA, 4, 4, 1, 2, 1, 1, 2, 2, 4, 
NA, NA, 2, 2, NA, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag", 
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", 
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", 
"Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", 
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", 
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", 
"Volvo 142E"), class = "data.frame")

Upvotes: 1

TarJae
TarJae

Reputation: 78927

Here is another approach how you could do it: Example:

librara(dplyr)
iris1 <- iris %>% 
  select(1, 2, 5)
head(iris1, 10) %>% 
  as_tibble() %>% 
  mutate(across(where(is.numeric), ~ifelse(.<= 3, NA, .))) %>% 
  mutate(across(where(is.numeric), ~ifelse(is.na(.), median(.,na.rm = TRUE), .)))
   Sepal.Length Sepal.Width Species
          <dbl>       <dbl> <fct>  
 1          5.1         3.5 setosa 
 2          4.9         3.4 setosa 
 3          4.7         3.2 setosa 
 4          4.6         3.1 setosa 
 5          5           3.6 setosa 
 6          5.4         3.9 setosa 
 7          4.6         3.4 setosa 
 8          5           3.4 setosa 
 9          4.4         3.4 setosa 
10          4.9         3.1 setosa 

Upvotes: 2

akrun
akrun

Reputation: 887048

May be we need to change the function to directly subset the columns and updating the columns, instead of creating another object and then updating

median_forNumericalNulls <- function(dataframe){
  
  nums <- unlist(lapply(dataframe, is.numeric))  
  
  df_num <- dataframe[ , nums]
  
  dataframe[nums] <- lapply(dataframe[nums], function(x) { 
    x[is.na(x)] <- median(x, na.rm = TRUE)
    x
  })      
  dataframe
  
}

-testing

A <- median_forNumericalNulls(A)

Also, this can be done in a compact way with na.aggregate though

library(zoo)
A <- na.aggregate(A, FUN = median)

Or using tidyverse

library(dplyr)
A <- A %>%
   mutate(across(where(is.numeric), 
         ~ replace(., is.na(.), median(., na.rm = TRUE))))

Upvotes: 2

Related Questions