Reputation: 7107

applying a function across columns by extracting similar column names

My data looks like:

[[1]]
        date germany france germany_mean france_mean germany_sd france_sd
1 2016-01-01      17     25     21.29429    48.57103   30.03026  47.05169

What I am trying to do is to compute the following calculation over all the lists using map.

germany_calc = (germany - germany_mean) / germany_sd 
france_calc = (france - france_mean) / france_sd

However the number of columns can change - here there are two categories/countries but in another list there could be 1 or 3 or N. The countries always follow the same structure. That is,

"country1", "country2", ... , "countryN", "country1_mean", "country2_mean", ... , "countryN_mean", "country1_sd", "country2_sd", ... , "countryN_sd".

Expected Output (for the first list):

Germany: -0.1429988 =  (17 - 21.29429) / 30.03026 
France: -0.5009603 = (25 - 48.57103) / 47.05169

EDIT: Apologies - expected output:

-0.1429988
-0.5009603

Function:

Scale_Me <- function(x){
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

Data:

    my_list <- list(structure(list(date = structure(16801, class = "Date"), 
    germany = 17, france = 25, germany_mean = 21.2942922374429, 
    france_mean = 48.5710301846855, germany_sd = 30.030258443028, 
    france_sd = 47.0516928425878), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16802, class = "Date"), 
    germany = 9, france = 29, germany_mean = 21.2993150684932, 
    france_mean = 48.5605316914534, germany_sd = 30.0286190461173, 
    france_sd = 47.0543871206842), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16803, class = "Date"), 
    germany = 8, france = 18, germany_mean = 21.2947488584475, 
    france_mean = 48.551889593794, germany_sd = 30.0297291333284, 
    france_sd = 47.0562416513092), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16804, class = "Date"), 
    germany = 3, france = 11, germany_mean = 21.2778538812785, 
    france_mean = 48.5382545766386, germany_sd = 30.0267943793948, 
    france_sd = 47.0607680244109), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16805, class = "Date"), 
    germany = 4, france = 13, germany_mean = 21.2614155251142, 
    france_mean = 48.5214531240057, germany_sd = 30.0269420596686, 
    france_sd = 47.0676011750263), class = "data.frame", row.names = c(NA, 
-1L)), structure(list(date = structure(16806, class = "Date"), 
    germany = 4, france = 9, germany_mean = 21.253196347032, 
    france_mean = 48.5055948249362, germany_sd = 30.0292032528186, 
    france_sd = 47.0737183354519), class = "data.frame", row.names = c(NA, 
-1L)))

Upvotes: 6

Answers (4)

G. Grothendieck

Reputation: 269586

The question is unclear on the exact form of output so we assume that what is wanted is a data frame with a column for date and a column for each country in which the country value is normalized. In this case it means we want 3 columns in the output.

1) pivot_longer/_wider Bind the my_list list components together creating a data frame with a row from each component. Then for each bare country name among the columns append _root to it so that every column name except date is of the form country_suffix. Then convert to long form, perform the normalization and convert back to wide form:

library(dplyr)
library(tidyr)
library(purrr)

my_list %>%
  bind_rows %>%
  set_names(names(.)[1], sub("^([^_]*)$", "\\1_root", names(.)[-1])) %>%
  pivot_longer(-date, names_to = c("country", ".value"), names_sep = "_") %>%
  mutate(root = (root - mean) / sd) %>%
  pivot_wider(id_cols = "date", names_from = "country", values_from = "root")

giving:

# A tibble: 6 x 3
  date       germany france
  <date>       <dbl>  <dbl>
1 2016-01-01  -0.143 -0.501
2 2016-01-02  -0.410 -0.416
3 2016-01-03  -0.443 -0.649
4 2016-01-04  -0.609 -0.798
5 2016-01-05  -0.575 -0.755
6 2016-01-06  -0.575 -0.839

2) Base R

After rbinding the list components together giving d we pick out the country names, nms, as those names not containing an underscore except for the first such (which is date). Then perform the normalization and cbind the date column to that.

d <- do.call("rbind", my_list)
nms <- grep("_", names(d), invert = TRUE, value = TRUE)[-1]
cbind(d[1], (d[nms] - d[paste0(nms, "_mean")]) / d[paste0(nms, "_sd")])

giving:

        date    germany     france
1 2016-01-01 -0.1429988 -0.5009603
2 2016-01-02 -0.4095864 -0.4157005
3 2016-01-03 -0.4427196 -0.6492633
4 2016-01-04 -0.6087181 -0.7976550
5 2016-01-05 -0.5748642 -0.7546901
6 2016-01-06 -0.5745473 -0.8392283

Upvotes: 4

akrun

Reputation: 887118

We can use transform as well in base R

transform(do.call(rbind, my_list), 
  germany = (germany - germany_mean)/germany_sd, 
   france = (france - france_mean)/france_sd)[c('date', 'germany', 'france')]
#     date    germany     france
#1 2016-01-01 -0.1429988 -0.5009603
#2 2016-01-02 -0.4095864 -0.4157005
#3 2016-01-03 -0.4427196 -0.6492633
#4 2016-01-04 -0.6087181 -0.7976550
#5 2016-01-05 -0.5748642 -0.7546901
#6 2016-01-06 -0.5745473 -0.8392283

Or in dplyr, without any reshaping, this can be done

library(dplyr)
bind_rows(my_list) %>% 
   transmute(date,
             germany = (germany - germany_mean)/germany_sd,
             france = (france - france_mean)/france_sd)

Upvotes: 4

jay.sf

Reputation: 72813

Why not just rbind the thing?

with(do.call(rbind, my_list), 
     cbind(germany=(germany - germany_mean) / germany_sd,
           france=(france - france_mean) / france_sd))
#         germany     france
# [1,] -0.1429988 -0.5009603
# [2,] -0.4095864 -0.4157005
# [3,] -0.4427196 -0.6492633
# [4,] -0.6087181 -0.7976550
# [5,] -0.5748642 -0.7546901
# [6,] -0.5745473 -0.8392283

Upvotes: 5

dc37

Reputation: 16178

Do you have to use map ? Here I get your desired output using two for loops instead of using map

Result_list = vector("list",length(my_list))
for(i in 1:length(my_list))
{
  df = my_list[[i]]
  # identifier number of countries
  countries = colnames(df)[grep('mean',colnames(df))]
  countries = gsub("_mean","",countries)

  df_result = NULL
  for(j in 1:length(countries))
  {
    country = countries[j]
    value_country = df[1,match(country,colnames(df))]
    mean_country = df[1,match(paste0(country,"_mean"),colnames(df))]
    sd_country = df[1,match(paste0(country,"_sd"),colnames(df))]

    result_country = (value_country - mean_country) / sd_country
    Sentence = paste0(country,": ",round(result_country,5)," = (",value_country," - ",round(mean_country,5),") / ",round(sd_country,5))
    df_result = c(df_result,Sentence)
  }
  Result_list[[i]] = df_result
}

And the output Result_list looks like:

> Result_list
[[1]]
[1] "germany: -0.143 = (17 - 21.29429) / 30.03026" 
[2] "france: -0.50096 = (25 - 48.57103) / 47.05169"

[[2]]
[1] "germany: -0.40959 = (9 - 21.29932) / 30.02862"
[2] "france: -0.4157 = (29 - 48.56053) / 47.05439" 

[[3]]
[1] "germany: -0.44272 = (8 - 21.29475) / 30.02973"
[2] "france: -0.64926 = (18 - 48.55189) / 47.05624"

[[4]]
[1] "germany: -0.60872 = (3 - 21.27785) / 30.02679"
[2] "france: -0.79765 = (11 - 48.53825) / 47.06077"

[[5]]
[1] "germany: -0.57486 = (4 - 21.26142) / 30.02694"
[2] "france: -0.75469 = (13 - 48.52145) / 47.0676" 

[[6]]
[1] "germany: -0.57455 = (4 - 21.2532) / 30.0292" 
[2] "france: -0.83923 = (9 - 48.50559) / 47.07372"

Is it what you are looking for ?

EDIT: Extracting only results

For extracting only result values, you can do the following:

Df_result_value = NULL
for(i in 1:length(my_list))
{
  df = my_list[[i]]
  # identifier number of countries
  countries = colnames(df)[grep('mean',colnames(df))]
  countries = gsub("_mean","",countries)

  for(j in 1:length(countries))
  {
    country = countries[j]
    value_country = df[1,match(country,colnames(df))]
    mean_country = df[1,match(paste0(country,"_mean"),colnames(df))]
    sd_country = df[1,match(paste0(country,"_sd"),colnames(df))]

    result_country = (value_country - mean_country) / sd_country

    Df_result_value = rbind(Df_result_value,c(country,result_country))
  }
}
Df_result_value = data.frame(Df_result_value)
colnames(Df_result_value) = c("Country","Result")

And get this output:

> Df_result_value
   Country             Result
1  germany -0.142998843835787
2   france -0.500960300483614
3  germany -0.409586436512588
4   france -0.415700488060442
5  germany -0.442719572974515
6   france -0.649263275639099
7  germany -0.608718121899195
8   france -0.797654950237258
9  germany -0.574864249939699
10  france -0.754690110335453
11 germany -0.574547256608035
12  france -0.839228262008441

Upvotes: 4

applying a function across columns by extracting similar column names

Answers (4)

Related Questions