cho7tom
cho7tom

Reputation: 1070

how to clean survey data?

How should I proceed (in R) to tidy my dataset in the following way?

input

enter image description here

expected output

enter image description here

Leveraging the tidyr package

I am thinking of using tidyr, but I have not figured out how to proceed yet. Any suggestions?

Data

input
# Raw survey data in wide form (dput-style literal): 12 rows, one per respondent.
# - ID: factor whose levels are sorted alphabetically ("obs 1", "obs 10", ...),
#   so the integer codes below put the rows in the order obs 1, obs 2, ..., obs 12.
# - Proposal.<k>...<rank>: one factor column per (proposal, rank) pair, with
#   levels c("", "<rank>"); code 1L is the blank level and code 2L is the rank label.
input <- structure(list(ID = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L), 
                               .Label = c("obs 1", "obs 10", "obs 11", "obs 12", "obs 2", "obs 3", 
                                          "obs 4", "obs 5", "obs 6", "obs 7", "obs 8", "obs 9"), 
                               class = "factor"), 
                # Proposal 1: rank columns (first / second / last)
                Proposal.1...first = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L), 
                                               .Label = c("", "first"), class = "factor"), 
                Proposal.1...second = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), 
                                                .Label = c("", "second"), class = "factor"), 
                Proposal.1...last = structure(c(1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                              .Label = c("", "last"), class = "factor"), 
                # Proposal 2: rank columns (first / second / last)
                Proposal.2...first = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L), 
                                               .Label = c("", "first"), class = "factor"), 
                Proposal.2...second = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                                .Label = c("", "second"), class = "factor"), 
                Proposal.2...last = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L), 
                                              .Label = c("", "last"), class = "factor"), 
                # Proposal 3: rank columns (first / second / last)
                Proposal.3...first = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                               .Label = c("", "first"), class = "factor"), 
                Proposal.3...second = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L), 
                                                .Label = c("", "second"), class = "factor"), 
                Proposal.3...last = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                              .Label = c("", "last"), class = "factor"), 
                # Proposal 4: rank columns (first / second / last)
                Proposal.4...first = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L), 
                                               .Label = c("", "first"), class = "factor"), 
                Proposal.4...second = structure(c(1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                                .Label = c("", "second"), class = "factor"), 
                Proposal.4...last = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), 
                                              .Label = c("", "last"), class = "factor")), 
           # Column names repeat the list element names given above.
           .Names = c("ID", "Proposal.1...first", "Proposal.1...second", "Proposal.1...last", "Proposal.2...first", 
                      "Proposal.2...second", "Proposal.2...last", "Proposal.3...first","Proposal.3...second", 
                      "Proposal.3...last", "Proposal.4...first", "Proposal.4...second", "Proposal.4...last"), 
           class = "data.frame", 
           # c(NA, -12L) is R's compact encoding for automatic row names 1..12.
           row.names = c(NA, -12L))
expected output
# Expected tidy result (dput-style literal): 12 rows, one per respondent, with
# one column per rank. Each of first/second/last is a factor over the levels
# "Proposal 1".."Proposal 4" naming the proposal that received that rank.
# ID uses the same alphabetically sorted levels and integer codes as `input`,
# so the rows are in the order obs 1, obs 2, ..., obs 12.
output <- structure(list(ID = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L), 
                                    .Label = c("obs 1", "obs 10", "obs 11", "obs 12", "obs 2", "obs 3", "obs 4", "obs 5", 
                                               "obs 6", "obs 7", "obs 8", "obs 9"), class = "factor"), 
                     first = structure(c(1L, 1L, 2L, 4L, 2L, 3L, 2L, 4L, 1L, 1L, 4L, 2L), 
                                       .Label = c("Proposal 1", "Proposal 2", "Proposal 3", "Proposal 4"), class = "factor"), 
                     second = structure(c(2L, 4L, 3L, 3L, 4L, 1L, 3L, 1L, 3L, 3L, 3L, 3L), 
                                        .Label = c("Proposal 1", "Proposal 2", "Proposal 3", "Proposal 4"), class = "factor"), 
                     last = structure(c(3L, 3L, 1L, 1L, 1L, 2L, 4L, 2L, 4L, 4L, 2L, 4L), 
                                      .Label = c("Proposal 1", "Proposal 2", "Proposal 3", "Proposal 4"), class = "factor")), 
                # c(NA, -12L) is R's compact encoding for automatic row names 1..12.
                .Names = c("ID", "first", "second", "last"), class = "data.frame", row.names = c(NA, -12L))

Thanks!

Upvotes: 3

Views: 271

Answers (1)

scoa
scoa

Reputation: 19867

With dplyr and tidyr, you could use a combination of gather and spread:

library(dplyr)
library(tidyr)

# Reshape the wide survey table into one row per ID with the columns
# first / second / last, each holding the proposal that received that rank.
#
# pivot_longer() / pivot_wider() are tidyr's current replacements for the
# superseded gather() / spread() pair; the overall long -> wide strategy is
# unchanged.
input %>%
  # Long form: one row per (ID, original column). The source columns are
  # factors with different level sets, so convert the values to character
  # explicitly instead of relying on implicit attribute dropping.
  pivot_longer(
    cols = -ID,
    names_to = "proposal",
    values_to = "value",
    values_transform = list(value = as.character)
  ) %>%
  mutate(
    # Strip the "...first" / "...second" / "...last" suffix from the name.
    proposal = sub("\\.{3}.*", "", proposal),
    # Cosmetic: "Proposal.1" -> "Proposal 1".
    proposal = sub("\\.", " ", proposal)
  ) %>%
  # Keep only the cells where the respondent actually assigned a rank.
  filter(value != "") %>%
  # Wide form: the rank labels become column names, proposals the values.
  pivot_wider(names_from = value, values_from = proposal) %>%
  # Cosmetic: fix the column order.
  select(ID, first, second, last) %>%
  # Cosmetic: order rows by the numeric part of the ID ("obs 10" after "obs 9").
  arrange(as.numeric(sub("obs ", "", ID))) %>%
  # pivot_*() return tibbles; convert back so the result is a plain data.frame.
  as.data.frame()

output

       ID      first     second       last
1   obs 1 Proposal 1 Proposal 2 Proposal 3
2   obs 2 Proposal 1 Proposal 4 Proposal 3
3   obs 3 Proposal 2 Proposal 3 Proposal 1
4   obs 4 Proposal 4 Proposal 3 Proposal 1
5   obs 5 Proposal 2 Proposal 4 Proposal 1
6   obs 6 Proposal 3 Proposal 1 Proposal 2
7   obs 7 Proposal 2 Proposal 3 Proposal 4
8   obs 8 Proposal 4 Proposal 1 Proposal 2
9   obs 9 Proposal 1 Proposal 3 Proposal 4
10 obs 10 Proposal 1 Proposal 3 Proposal 4
11 obs 11 Proposal 4 Proposal 3 Proposal 2
12 obs 12 Proposal 2 Proposal 3 Proposal 4

Upvotes: 7

Related Questions