Reputation: 55
I have a huge table with unarranged data in its rows (relative to the name of the columns). The table looks something like this:
A B C
A1 B1 C1
A2 C2 NA
A3 C3 B3
NA B4 C4
I was wondering if there is way to re-arrange this table so that all values that start with "A" will be on the first column and all values that start with "B" will be on the second column and so on...
The desired outcome will be something like this:
A B C
A1 B1 C1
A2 NA C2
A3 B3 C3
NA B4 C4
Upvotes: 4
Views: 275
Reputation: 12723
Here is one efficient way to approach it with data.table
library('data.table')
setDT(df1) # assign data table by reference
# melt df1 data with unique id for each row
df1 <- melt( data = df1[, id := .I], measure.vars = setdiff( names(df1), 'id' ))
# assign variable = NA which has value = NA, and variable = column_names for other values
df1[, variable := gsub("[0-9]", "", value)]
# rearrange df1 into multiple columns and remove NA and id columns
dcast(df1, "id ~ variable", value.var = "value")[, `:=` (`NA` = NULL, id = NULL ) ][]
Output:
# A B C
# 1: A1 B1 C1
# 2: A2 NA C2
# 3: A3 B3 C3
# 4: NA B4 C4
Data:
df1 <- read.table(text = "
A B C
A1 B1 C1
A2 C2 NA
A3 C3 B3
NA B4 C4",
header = TRUE, stringsAsFactors = FALSE)
EDIT:
using the data above.
library('data.table')
setDT(df1)[, D:= c( NA_character_)] # add column D
df1[, id := .I] # add unique id for each row
nm <- copy( setdiff( names(df1), 'id' ) ) # get column names and remove id
df1 <- dcast( data = melt( data = df1, measure.vars = nm)[, variable := gsub("[0-9]", "", value)],
formula = "id ~ variable",
# subsetting is done because, two NA values are passed to the aggregate function two times ( column D and A for row-4; column C and D for row-2 ).
# you can check it using this: function(x) print(x)
fun = function(x)x[1],
value.var = "value",
fill = NA_character_ )[, id := NULL ][]
# check for number of columns
if( ncol(df1) > length(nm) ){
df1[, `NA` := NULL ] # remove extra NA column
nm <- setdiff(names(df1), 'NA') # remove extra column name: NA
}
# rename columns efficiently by reference
setnames( df1, nm)
output
print(df1)
# A B C D
# 1: A1 B1 C1 NA
# 2: A2 NA C2 NA
# 3: A3 B3 C3 NA
# 4: NA B4 C4 NA
Upvotes: 1
Reputation: 47330
We can use pmatch
into an apply
call, and then reconvert as a data.frame
with proper names:
abc <- names(df)
setNames(as.data.frame(t(
apply(df,1,function(x) x[pmatch(abc,x)])
)),abc)
# A B C
# 1 A1 B1 C1
# 2 A2 <NA> C2
# 3 A3 B3 C3
# 4 <NA> B4 C4
This is how pmatch
works here:
x <- c("A2","C2",NA)
x[pmatch(abc,x)]
# [1] "A2" NA "C2"
Upvotes: 1
Reputation: 721
Here's a really bad solution:
rm(list=ls())
dataset<-read.csv("exampleSO.csv",header = F);
a<-as.matrix.data.frame(dataset);
loc<-as.vector(a[1,]);
ourM<-a[2:5,];
x<-matrix(data=NA, nrow=4,ncol=3)
for (i in 1:nrow(ourM)){
for(j in 1:ncol(ourM)){
place<-which(substr(ourM[i,j],1,1)==loc)
x[i,place] <- ourM[i,j]
}
}
as.matrix.data.frame(rbind(loc,x))
#result
[,1][,2] [,3]
[1,] "A" "B" "C"
[2,] "A1" "B1" "C1"
[3,] "A2" NA "C2"
[4,] "A3" "B3" "C3"
[5,] NA "B4" "C4"
Replace exampleSO.csv with your .csv
containing the data or any variable that contains that dataframe.
Tell me if it works!
Upvotes: 0
Reputation: 28695
For each row this finds, in df[1,]
, the first letter of each element, and moves that element to the same column as the match found in df[1,]
.
library(tidyverse)
df %>%
pmap(~{ x <- c(...)
ind <- map_dbl(substr(x, 1, 1), ~grep(.x, df[1,])[1]) %>%
.[!is.na(.)]
rep(NA, 3) %>%
`[<-`(ind, x[!is.na(x)])}) %>%
do.call(what = rbind) %>%
as.data.frame
# V1 V2 V3
# 1 A1 B1 C1
# 2 A2 <NA> C2
# 3 A3 B3 C3
# 4 <NA> B4 C4
Upvotes: 1
Reputation: 6132
read.table(text = "
A B C
A1 B1 C1
A2 C2 NA
A3 C3 B3
NA B4 C4",
header = T) -> df
dfnew <- matrix(c(paste0(rep(LETTERS[1:3], each = 4), seq(1:4))), ncol = 3, nrow = 4)
dfnew[!dfnew %in% sort(as.character(unlist(df)))] <- NA
dfnew
[,1] [,2] [,3]
[1,] "A1" "B1" "C1"
[2,] "A2" NA "C2"
[3,] "A3" "B3" "C3"
[4,] NA "B4" "C4"
In short: create a new matrix that contains all possible combinations of letters and numbers, and set those to NA which are not in the original table.
Upvotes: 1