Reputation: 72731
As part of piloting a survey, I presented each Turker with sets of choices amongst four alternatives. The data looks like this:
> so
WorkerId pio_1_1 pio_1_2 pio_1_3 pio_1_4 pio_2_1 pio_2_2 pio_2_3 pio_2_4
1 1 Yes No No No No No Yes No
2 2 No Yes No No Yes No Yes No
3 3 Yes Yes No No Yes No Yes No
I'd like it to look like this:
WorkerId set pio1 pio2 pio3 pio4
1 1 Yes No No No
1 2 No No Yes No
...
I can kludge through this by a number of means, none of which seem very elegant:
But it seems to me that all of these ignore the idea that data in what you might call "double wide" format has its own structure. I'd love to use the reshape2 package for this, but despite the data having been produced with cast() I don't see any options that would help me truly melt this data.frame back.
Suggestions welcome.
so <- structure(list(WorkerId = 1:3, pio_1_1 = structure(c(2L, 1L,
2L), .Label = c("No", "Yes"), class = "factor"), pio_1_2 = structure(c(1L,
2L, 2L), .Label = c("No", "Yes"), class = "factor"), pio_1_3 = structure(c(1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), pio_1_4 = structure(c(1L,
1L, 1L), .Label = "No", class = "factor"), pio_2_1 = structure(c(1L,
2L, 2L), .Label = c("No", "Yes"), class = "factor"), pio_2_2 = structure(c(1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), pio_2_3 = structure(c(2L,
2L, 2L), .Label = c("No", "Yes"), class = "factor"), pio_2_4 = structure(c(1L,
1L, 1L), .Label = "No", class = "factor")), .Names = c("WorkerId",
"pio_1_1", "pio_1_2", "pio_1_3", "pio_1_4", "pio_2_1", "pio_2_2",
"pio_2_3", "pio_2_4"), row.names = c(NA, 3L), class = "data.frame")
Upvotes: 6
Views: 542
Reputation: 72731
Here's what I eventually went with, largely based off @gauden 's approach. In re-reading Hadley's tidy data pdf, it turns out he advises a similar course.
melt.wide <- function(data, id.vars, new.names, sep=".", variable.name="variable", ... ) {
# Guess number of variables currently wide
colnames(data) <- sub( paste0(sep,"$"), "", colnames(data) )
wide.vars <- colnames(data)[grep( sep, colnames(data) )]
n.wide <- str_count( wide.vars, sep )
stopifnot(length(new.names)==unique(n.wide))
# Melt
data.melt <- melt(data,id.vars=id.vars,measure.vars=wide.vars,...)
new <- stack.list(str_split(data.melt$variable,sep))
colnames(new) <- c(variable.name,new.names)
data.melt <- subset(data.melt,select=c(-variable))
cbind(data.melt,new)
}
# Stacks lists of data.frames (e.g. from replicate() )
stack.list <- function( x, label=FALSE, ... ) {
ret <- x[[1]]
if(label) { ret$from <- 1 }
if(length(x)==1) return(ret)
for( i in seq(2,length(x)) ) {
new <- x[[i]]
if(label) { new$from <- i }
ret <- rbind(ret,new)
}
return(ret)
}
> dat<-melt.wide(so,id.vars="WorkerId",new.names=c("set","option"),sep="_")
> dcast(dat, WorkerId + set ~ option)
WorkerId set 1 2 3 4
1 1 1 Yes No No No
2 1 2 No No Yes No
3 2 1 No Yes No No
4 2 2 Yes No Yes No
5 3 1 Yes Yes No No
6 3 2 Yes No Yes No
Upvotes: 1
Reputation: 193517
Here's another solution using reshape2
and stringr
melt.wide = function(data, id.vars, new.names) {
require(reshape2)
require(stringr)
data.melt = melt(data, id.vars=id.vars)
new.vars = data.frame(do.call(
rbind, str_extract_all(data.melt$variable, "[0-9]+")))
names(new.vars) = new.names
cbind(data.melt, new.vars)
}
Then, you use it like this:
> so.long = melt.wide(so, id.vars=1, new.names=c("set", "option"))
> dcast(so.long, WorkerId + set ~ option)
WorkerId set 1 2 3 4
1 1 1 Yes No No No
2 1 2 No No Yes No
3 2 1 No Yes No No
4 2 2 Yes No Yes No
5 3 1 Yes Yes No No
6 3 2 Yes No Yes No
I think that using stringr
might prove to be a more simple solution than those in the functions that have been suggested so far.
Here's why I like this solution: it also works if your data is, say, triple wide. Here's an example, (with data modified from here):
triplewide = structure(list(ID = 1:4, w1d1t1 = c(4L, 3L, 2L, 2L), w1d1t2 = c(5L,
4L, 3L, 3L), w1d2t1 = c(6L, 5L, 5L, 4L), w1d2t2 = c(5L, 4L, 5L,
2L), w2d1t1 = c(6L, 5L, 4L, 3L), w2d1t2 = c(5L, 4L, 5L, 5L),
w2d2t1 = c(6L, 3L, 6L, 3L), w2d2t2 = c(7L, 4L, 3L, 2L)), .Names = c("ID",
"w1d1t1", "w1d1t2", "w1d2t1", "w1d2t2", "w2d1t1", "w2d1t2", "w2d2t1",
"w2d2t2"), class = "data.frame", row.names = c(NA, -4L))
This is what it looks like to start with:
> triplewide
ID w1d1t1 w1d1t2 w1d2t1 w1d2t2 w2d1t1 w2d1t2 w2d2t1 w2d2t2
1 1 4 5 6 5 6 5 6 7
2 2 3 4 5 4 5 4 3 4
3 3 2 3 5 5 4 5 6 3
4 4 2 3 4 2 3 5 3 2
A variable name like w1d1t1
means "week 1, day 1, test 1". Let's say that your expected "tidy data" should be a dataset with the columns "ID", "week", "day", "trial 1", and "trial 2", then you can use the function as follows:
> triplewide.long = melt.wide(triplewide, id.vars="ID",
+ new.names=c("week", "day", "trial"))
> dcast(triplewide.long, ID + week + day ~ trial)
ID week day 1 2
1 1 1 1 4 5
2 1 1 2 6 5
3 1 2 1 6 5
4 1 2 2 6 7
5 2 1 1 3 4
6 2 1 2 5 4
7 2 2 1 5 4
8 2 2 2 3 4
9 3 1 1 2 3
10 3 1 2 5 5
11 3 2 1 4 5
12 3 2 2 6 3
13 4 1 1 2 3
14 4 1 2 4 2
15 4 2 1 3 5
16 4 2 2 3 2
Upvotes: 4
Reputation: 109844
If we call your original data set dat this will do it using base:
dat2 <- reshape(dat,
varying=list(pio_1= c(2, 6), pio_2= c(3,7), pio_3= c(4,8), pio_4= c(5,9) ),
v.names=c(paste0("pio_",1:4)),
idvar = "WorkerId",
direction="long",
timevar="set")
row.names(dat2) <- NULL
dat2[order(dat2$WorkerId, dat2$set), ]
Which yields:
WorkerId set pio_1 pio_2 pio_3 pio_4
1 1 1 Yes No No No
2 1 2 No No Yes No
3 2 1 No Yes No No
4 2 2 Yes No Yes No
5 3 1 Yes Yes No No
6 3 2 Yes No Yes No
EDIT: (Alright I couldn't resist taking a crack at making it easier to automate)
y <- do.call('rbind', strsplit(names(dat)[-1], "_"))[, c(1, 3, 2)]
names(dat) <- c(names(dat)[1], paste0(y[, 1], "_", y[, 2], ".", y[, 3]))
dat2 <- reshape(dat,
varying=2:9,
idvar = "WorkerId",
direction="long",
timevar="set")
row.names(dat2) <- NULL
dat2[order(dat2$WorkerId, dat2$set), ]
Upvotes: 4
Reputation: 193517
I would suggest doing some gsub on your names to get them into a form that R likes better, in other words, with the time variable being the last item, not the middle item, and with a "." being the default separator.
Try this:
names(so) = gsub("([a-z])_([0-9])_([0-9])", "\\1_\\3\\.\\2", names(so))
so.l = reshape(so, direction="long", varying=2:9, timevar="set", idvar=1)
Then, if you want to sort by WorkerId:
so.l = so.l[order(so.l$WorkerId), ]
Upvotes: 4
Reputation: 10923
I am not sure if this is too obvious, but here goes. It should be self-explanatory. Pass in your so
dataframe and it returns the reshaped data.
library("reshape2")
reshape.middle <- function(dat) {
dat <- melt(so, id="WorkerId")
dat$set <- substr(dat$variable, 5,5)
dat$name <- paste(substr(dat$variable, 1, 4),
substr(dat$variable, 7, 7),
sep="")
dat$variable <- NULL
dat <- melt(dat, id=c("WorkerId", "set", "name"))
dat$variable <- NULL
return(dcast(dat, WorkerId + set ~ name))
}
so # initial form
so <- reshape.middle(so)
so # as needed
Hope this helps.
Upvotes: 3