Reputation: 2725
I have a data frame that looks like
m1 m2 m3
P001.st 60.00 2.0 1
P003.nd 14.30 2.077 1
P003.rt 29.60 2.077 1
P006.st 10.30 2.077 1
P006.nd 79.30 2.077 1
P008.nd 9.16 2.077 1
I want to reformat table so that only first part (before period, i.e., P001, P003 etc) of the row name appear as row names and append the each subsequent rows with similar names to columns. The output should look like
m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
P001 60.00 2.0 1 0 0 0 0 0 0
P003 0 0 0 14.30 2.077 1 29.60 2.077 1
P006 10.30 2.077 1 79.30 2.077 1 0 0 0
P008 0 0 0 9.16 2.077 1 0 0 0
The aggregate function like
aggregate(value~name, df, I)
or a method from data.table like
setDT(df)[, list(value=list(value)), by=name]
would not work because row.names are not exactly the same. Any suggestions for matching hundreds of rows with many variable subtypes (i.e, after period: .nd, .st etc).
Upvotes: 1
Views: 868
Reputation: 3488
Using extract()
instead of separate()
with the more flexible regular expressions, using tidyr
and dplyr
:
df %>%
extract(id, c("id2", "var"), c("(P00.)\\.(..)")) %>%
gather(variable,value,c(m1,m2,m3)) %>%
mutate(var=paste0(variable,".",var)) %>%
select(-variable) %>%
spread(var,value,fill=0)
id2 m1.nd m1.rt m1.st m2.nd m2.rt m2.st m3.nd m3.rt m3.st
1 P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
2 P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
3 P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
4 P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
Upvotes: 1
Reputation: 49448
dt = as.data.table(your_df, keep.rownames = T)
# split the row names into two id's
dt[, `:=`(id1 = sub('\\..*', '', rn), id2 = sub('.*\\.', '', rn), rn = NULL)]
# melt and dcast (need latest 1.9.5 or have to load reshape2 and use dcast.data.table)
dcast(melt(dt, id.vars = c('id1', 'id2')), id1 ~ variable + id2, fill = 0)
# id1 m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
#1: P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
#2: P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
#3: P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
#4: P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
Upvotes: 3
Reputation: 2022
Try this:
library(tidyr)
library(dplyr)
library(reshape2)
library(stringr)
data <-
structure(list(m1 = c(60, 14.3, 29.6, 10.3, 79.3, 9.16),
m2 = c(2, 2.077, 2.077, 2.077, 2.077, 2.077),
m3 = c(1L, 1L, 1L, 1L, 1L, 1L)),
.Names = c("m1", "m2", "m3"),
class = "data.frame",
row.names = c("P001.st", "P003.nd", "P003.rt",
"P006.st", "P006.nd", "P008.nd"))
my_data <-
as_data_frame(cbind(col_01 = rownames(data), data)) %>%
melt(.) %>%
separate(., col_01, into = c("var_01", "var_02"), sep = "\\.") %>%
mutate(my_var = str_c(variable, var_02)) %>%
select(var_01, my_var, value) %>%
arrange(var_01, my_var) %>%
spread(., my_var, value)
my_data
var_01 m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
1 P001 NA NA 60.0 NA NA 2.000 NA NA 1
2 P003 14.30 29.6 NA 2.077 2.077 NA 1 1 NA
3 P006 79.30 NA 10.3 2.077 NA 2.077 1 NA 1
4 P008 9.16 NA NA 2.077 NA NA 1 NA NA
If you want to replace NAs with 0, you can do it like this:
my_data[is.na(my_data)] <- 0
var_01 m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
1 P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
2 P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
3 P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
4 P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
Upvotes: 1
Reputation: 61
If you have your data frame is called "data":
library(reshape2)
data$prefix <- gsub("(.*)\\..*","\\1",row.names(data))
data$suffix <- gsub(".*\\.(.*)","\\1",row.names(data))
data.melt <- melt(data)
data.melt
data.cast <- dcast(data.melt,prefix~variable+suffix,mean)
# set the row names to prefix
row.names(data.cast) <- data.cast$prefix
# get rid of the prefix column
data.cast <- data.cast[,-1]
data.cast
Gives
Using prefix, suffix as id variables
m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
P001 NaN NaN 60.0 NaN NaN 2.000 NaN NaN 1
P003 14.30 29.6 NaN 2.077 2.077 NaN 1 1 NaN
P006 79.30 NaN 10.3 2.077 NaN 2.077 1 NaN 1
P008 9.16 NaN NaN 2.077 NaN NaN 1 NaN NaN
To correct the column names and zeros instead of NaN, do
names(data.cast) <- gsub("_","",names(data.cast))
apply(data.cast,c(1,2),function(x){as.numeric(ifelse(is.na(x),0,x)) })
To get
m1nd m1rt m1st m2nd m2rt m2st m3nd m3rt m3st
P001 0.00 0.0 60.0 0.000 0.000 2.000 0 0 1
P003 14.30 29.6 0.0 2.077 2.077 0.000 1 1 0
P006 79.30 0.0 10.3 2.077 0.000 2.077 1 0 1
P008 9.16 0.0 0.0 2.077 0.000 0.000 1 0 0
Upvotes: 1
Reputation: 54237
Here's another way to do it:
library(dplyr)
library(tidyr)
(wide <- reshape(df %>% add_rownames() %>% separate(rowname, c("rowname", "id")),
idvar = "rowname",
timevar = "id",
direction = "wide",
sep = ""))
# rowname m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
# 1 P001 60.0 2.000 1 NA NA NA NA NA NA
# 2 P003 NA NA NA 14.30 2.077 1 29.6 2.077 1
# 4 P006 10.3 2.077 1 79.30 2.077 1 NA NA NA
# 6 P008 NA NA NA 9.16 2.077 1 NA NA NA
wide[is.na(wide)] <- 0
rownames(wide) <- wide[, 1]
wide$rowname <- NULL
wide
# m1st m2st m3st m1nd m2nd m3nd m1rt m2rt m3rt
# P001 60.0 2.000 1 0.00 0.000 0 0.0 0.000 0
# P003 0.0 0.000 0 14.30 2.077 1 29.6 2.077 1
# P006 10.3 2.077 1 79.30 2.077 1 0.0 0.000 0
# P008 0.0 0.000 0 9.16 2.077 1 0.0 0.000 0
Upvotes: 2