Reputation: 1186
I need to fill the null values of all the numerical columns with each column's median value in a data frame. I did the following code.
median_forNumericalNulls <- function(dataframe){
nums <- unlist(lapply(dataframe, is.numeric))
df_num <- dataframe[ , nums]
df_num[] <- lapply(df_num, function(x) {
x[is.na(x)] <- median(x, na.rm = TRUE)
x
})
return(dataframe)
}
median_forNumericalNulls(A)
A is the parent table, which consists of both numerical as well as categorical variables. How can I replace the columns of 'A' dataframe with the output of the function median_forNumericalNulls?
Is there a better way that we can do the same?
Upvotes: 2
Views: 586
Reputation: 5788
Base R solution:
# Function to deteremine data.frame vector indices that are numeric:
# resolve_num_cols => function()
resolve_num_cols <- function(df){
# Store a vector of numeric column names:
# num_cols => logical vector
num_cols <- which(
vapply(
df,
is.numeric,
logical(1),
USE.NAMES = FALSE
)
)
# Explicitly define the returned object: logical vector => env
return(num_cols)
}
# Function to impute median values for each numeric vector in data.frame
# impute_median_vals_in_df => function()
impute_median_vals_in_df <- function(df, num_col_idx){
# Replace the na. values in each numeric vector: df => data.frame
df[,num_col_idx] <- lapply(
num_col_idx,
function(col_idx){
df[,col_idx] <- ifelse(
is.na(df[,col_idx]),
median(df[,col_idx], na.rm = TRUE),
df[,col_idx]
)
}
)
# Return the data.frame object: data.frame => env
return(df)
}
# Apply the function to resolve the numeric vectors in data.frame:
# num_cols => integer vector
num_cols <- resolve_num_cols(df1)
# Apply the function: clean_df => data.frame
clean_df <- impute_median_vals_in_df(df1, num_cols)
Data used:
# Import data: df1 => data.frame
df1 <- structure(list(mpg = c(NA, 21, NA, 21.4, 18.7, 18.1, 14.3, 24.4,
22.8, 19.2, NA, 16.4, 17.3, NA, NA, 10.4, 14.7, 32.4, 30.4, 33.9,
21.5, NA, 15.2, 13.3, 19.2, 27.3, 26, NA, NA, 19.7, 15, 21.4),
cyl = c(NA, 6, NA, 6, 8, NA, NA, 4, 4, 6, 6, 8, 8, 8, 8,
8, 8, NA, NA, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, NA, NA, 4), disp = c(160,
160, 108, 258, 360, 225, 360, 146.7, 140.8, 167.6, 167.6,
NA, NA, 275.8, NA, 460, 440, 78.7, 75.7, 71.1, NA, 318, NA,
350, 400, NA, NA, 95.1, 351, NA, 301, 121), hp = c(110, 110,
93, 110, NA, 105, 245, 62, 95, 123, 123, 180, NA, 180, 205,
215, NA, 66, NA, 65, 97, NA, NA, 245, 175, 66, 91, NA, 264,
175, NA, 109), drat = c(3.9, 3.9, 3.85, 3.08, 3.15, 2.76,
3.21, NA, 3.92, 3.92, 3.92, 3.07, NA, 3.07, 2.93, 3, 3.23,
4.08, NA, 4.22, NA, 2.76, 3.15, 3.73, 3.08, NA, NA, 3.77,
4.22, 3.62, NA, NA), wt = c(2.62, 2.875, 2.32, 3.215, 3.44,
3.46, 3.57, NA, 3.15, 3.44, 3.44, 4.07, NA, NA, 5.25, 5.424,
5.345, 2.2, 1.615, 1.835, NA, NA, 3.435, 3.84, NA, NA, NA,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, NA, NA, NA, 20, NA, 18.3, 18.9, 17.4, 17.6, 18, NA,
17.82, NA, 19.47, 18.52, 19.9, NA, 16.87, NA, 15.41, 17.05,
18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6), vs = c(0, NA,
1, 1, NA, NA, 0, NA, 1, NA, 1, 0, 0, 0, 0, 0, 0, 1, NA, 1,
1, 0, 0, 0, NA, 1, NA, 1, 0, 0, 0, 1), am = c(NA, NA, NA,
0, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, NA,
0, 0, 0, NA, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3, NA,
3, 3, NA, NA, 4, 4, 3, 3, 3, 3, 3, NA, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, NA, NA, NA, NA, 4), carb = c(4, 4, 1, 1, 2, 1,
4, NA, NA, 4, NA, 3, NA, 3, NA, 4, 4, 1, 2, 1, 1, 2, 2, 4,
NA, NA, 2, 2, NA, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag",
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant",
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C",
"Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")
Upvotes: 1
Reputation: 78927
Here is another approach how you could do it: Example:
librara(dplyr)
iris1 <- iris %>%
select(1, 2, 5)
head(iris1, 10) %>%
as_tibble() %>%
mutate(across(where(is.numeric), ~ifelse(.<= 3, NA, .))) %>%
mutate(across(where(is.numeric), ~ifelse(is.na(.), median(.,na.rm = TRUE), .)))
Sepal.Length Sepal.Width Species
<dbl> <dbl> <fct>
1 5.1 3.5 setosa
2 4.9 3.4 setosa
3 4.7 3.2 setosa
4 4.6 3.1 setosa
5 5 3.6 setosa
6 5.4 3.9 setosa
7 4.6 3.4 setosa
8 5 3.4 setosa
9 4.4 3.4 setosa
10 4.9 3.1 setosa
Upvotes: 2
Reputation: 887048
May be we need to change the function to directly subset the columns and updating the columns, instead of creating another object and then updating
median_forNumericalNulls <- function(dataframe){
nums <- unlist(lapply(dataframe, is.numeric))
df_num <- dataframe[ , nums]
dataframe[nums] <- lapply(dataframe[nums], function(x) {
x[is.na(x)] <- median(x, na.rm = TRUE)
x
})
dataframe
}
-testing
A <- median_forNumericalNulls(A)
Also, this can be done in a compact way with na.aggregate
though
library(zoo)
A <- na.aggregate(A, FUN = median)
Or using tidyverse
library(dplyr)
A <- A %>%
mutate(across(where(is.numeric),
~ replace(., is.na(.), median(., na.rm = TRUE))))
Upvotes: 2