merge multilingual datasets r

Question

I have data frames from surveys that were asked in different languages. Most of the variables are factors. The factor levels in each data frame are in the language of the survey. I would like to merge the data frames of different languages into one. Sample data is below along with the steps I think are needed and where I am stuck.

#minimal data

# Toy survey data, one frame per language: two factor questions and one
# character column, fully crossed by expand.grid().
fr <- expand.grid(
  ques = c(factor(rep(c("oui", "non"), each = 2))),
  gender = c(factor(rep(c("femme", "homme"), each = 2))),
  chr = c(rep(c("c", "g"), times = 1))
)
en <- expand.grid(
  ques = c(factor(rep(c("yes", "no"), each = 2))),
  gender = c(factor(rep(c("man", "woman"), each = 2))),
  chr = c(rep(c("c", "g"), times = 1))
)

# expand.grid() converts character input to factor by default; turn the
# `chr` column back into plain character.
fr$chr <- as.character(fr$chr)
en$chr <- as.character(en$chr)

Step 1. Compare the factor levels of all factor variables to make sure they are coded the same way across languages, for example 1 = yes and 1 = oui. I can assess the comparison by eye, but I am looking for an efficient way to pull the factor levels of every factor variable in a data frame for this step.

Step 2. Recode any factors for consistency across languages, if needed.

# Show the current level order of the same variable in each language.
levels(fr[["gender"]])
levels(en[["gender"]])

# Put "homme" first so the French level order matches the English one.
fr$gender <- factor(fr[["gender"]], levels = c("homme", "femme"))

Step 3. convert factor variables to numeric

# as.numeric() on a factor yields its level codes, so every factor column
# becomes a numeric column of codes; other columns are left alone.
fr.1 <- mutate(fr, across(where(is.factor), as.numeric))
en.1 <- mutate(en, across(where(is.factor), as.numeric))

Step 4. merge the df

# Stack the French rows on top of the English rows; rbind() dispatches to the
# data-frame method because both inputs are data frames.
multi <- rbind(fr.1, en.1)

Step 5. Recode back to factors with English factor levels. I would like the merged data frame to have its variables as factors with the levels in English. I am not sure whether this is possible or how to accomplish this step.

Nir Graham · Accepted Answer

# Sample data from the question: per language, two factor variables and a
# two-value character variable, crossed into every combination.
fr <- expand.grid(
  ques = c(factor(rep(c("oui", "non"), each = 2))),
  gender = c(factor(rep(c("femme", "homme"), each = 2))),
  chr = c("c", "g")
)
en <- expand.grid(
  ques = c(factor(rep(c("yes", "no"), each = 2))),
  gender = c(factor(rep(c("man", "woman"), each = 2))),
  chr = c("c", "g")
)

# Store `chr` as character rather than the factor expand.grid() creates.
fr[["chr"]] <- as.character(fr[["chr"]])
en[["chr"]] <- as.character(en[["chr"]])


library(tidyverse)


#' Compare factor levels side by side across language-specific data frames
#'
#' For every factor column of the first frame, builds a table with one row per
#' level position (`lvl`) and one column per language holding that language's
#' level labels in their current order, so mismatched codings are easy to spot.
#'
#' @param lang_frames Either a character vector naming data frames that are
#'   visible from the caller (e.g. `c("fr", "en")`), or a named list of data
#'   frames.
#' @return A list of data frames, one per factor column of the first frame and
#'   named after that column. A language with fewer levels than the longest
#'   one is padded with blank strings of increasing width; a frame that lacks
#'   the column, or holds it as a non-factor, contributes only padding.
assess_frames <- function(lang_frames) {
  if (is.character(lang_frames)) {
    # Resolve every name exactly once, starting from the caller. Calling get()
    # inside the nested lambdas instead would pick up this function's own
    # locals whenever a frame shares a name with one of them.
    frames <- mget(
      lang_frames,
      envir = parent.frame(),
      mode = "list",
      inherits = TRUE
    )
  } else {
    frames <- lang_frames
  }
  stopifnot(
    is.list(frames),
    !is.data.frame(frames),
    length(frames) > 0,
    !is.null(names(frames)),
    all(nzchar(names(frames)))
  )

  # The set of variables to compare is defined by the first frame only.
  fac_vars <- names(select(frames[[1]], where(is.factor)))

  comparisons <- map(fac_vars, \(fac_var) {
    # `[[` takes the column name as a plain string, so it cannot be mistaken
    # for a column that happens to share a name with a local variable.
    level_sets <- map(frames, \(frame) levels(frame[[fac_var]]))
    max_levels <- max(lengths(level_sets))

    lang_cols <- imap(level_sets, \(lvls, lang) {
      n_missing <- max_levels - length(lvls)
      # Pad with 1, 2, 3, ... spaces so every filler value is distinct.
      tibble(!!lang := c(lvls, strrep(" ", seq_len(n_missing))))
    })

    # Drop the list names so only each tibble's own column name is used.
    bind_cols(data.frame(lvl = seq_len(max_levels)), unname(lang_cols))
  })

  names(comparisons) <- fac_vars
  comparisons
}

# First look: level tables for the frames as created.
assess_frames(c("fr", "en"))

# Move "woman" to the front so the English gender levels line up with the
# French femme/homme order.
en$gender <- relevel(en$gender, ref = "woman")

# Second look: check the level tables again after releveling.
assess_frames(c("fr", "en"))

merge multilingual datasets r

Answers (1)

Related Questions