Reputation: 383
I have a text data file like this (with multiple rows for respective category [A,B,C]):
A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14
When I load to R it becomes a data frame with 6 obs. and 1 variable
df <- read.delim("file.text",header = FALSE)
v1
1 A=1,2,3,6,
2 7,9
3 10
4 B=3,4,5
5 C=5,7,8,10,11,
6 13,14
How can I change to this format?
A B C
1 A
2 A
3 A B
4 B
5 B C
6 A
7 A C
8 C
9 A
10 A C
11 C
13 C
14 C
Thanks!
Upvotes: 3
Views: 603
Reputation: 39667
In base you can paste
the lines to one line, use strsplit
to get the single numbers and the columnname, create a named matrix
and fill it by using subset.
x <- readLines(con=textConnection("A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14"))
x <- paste(gsub("=","",x), collapse = ",")
y <- lapply(strsplit(unlist(strsplit(x, "[[:alpha:]]+"))[-1], ","), function(i) i[!i==""])
names(y) <- strsplit(x, "[^[:alpha:]]+")[[1]]
z <- sort(unique(as.numeric(unlist(y))))
res <- matrix("", nrow=length(z), ncol=length(y), dimnames=list(z, names(y)))
sapply(names(y), function(i) res[y[[i]], i] <<- i)
res
# A B C
#1 "A" "" ""
#2 "A" "" ""
#3 "A" "B" ""
#4 "" "B" ""
#5 "" "B" "C"
#6 "A" "" ""
#7 "A" "" "C"
#8 "" "" "C"
#9 "A" "" ""
#10 "A" "" "C"
#11 "" "" "C"
#13 "" "" "C"
#14 "" "" "C"
Upvotes: 1
Reputation: 388982
Here is a tidyverse
approach.
library(dplyr)
library(tidyr)
df %>%
#Remove commas at the end of the string
mutate(V1 = sub(",$", "", V1)) %>%
#Divide data into separate column based on "=" sign
separate(V1, into = c("let", "num"), sep = "=", fill = "left") %>%
#fill the NA values with it's respective group
fill(let) %>%
#Convert comma-separated value into different rows
separate_rows(num, convert = TRUE) %>%
#Convert data to wide format filling missing values with empty string
pivot_wider(names_from = let, values_from = let, values_fill = list(let = "")) %>%
#Arrange data according to numbers
arrange(num)
# # A tibble: 13 x 4
# num A B C
# <int> <chr> <chr> <chr>
# 1 1 A "" ""
# 2 2 A "" ""
# 3 3 A B ""
# 4 4 "" B ""
# 5 5 "" B C
# 6 6 A "" ""
# 7 7 A "" C
# 8 8 "" "" C
# 9 9 A "" ""
#10 10 A "" C
#11 11 "" "" C
#12 13 "" "" C
#13 14 "" "" C
If you want num
column as rownames add %>% column_to_rownames('num')
.
data
df <- read.table(text = "A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14")
Upvotes: 3
Reputation: 193527
The basic approach would be to read the data in as a string, split on =
and ,
, and then figure out the best way to identify which group each number belongs with.
In the following approach, I've intentionally used type conversion to create the groups. Since this will involve coercion from character to numeric, since there are characters in the split values, expect several warning messages.
# Load the data.table package.
library(data.table)
# Read in the data.
x <- fread("A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14", sep = "\n", header = FALSE)
x[, unlist(strsplit(V1, "=|,"), use.names = FALSE, recursive = FALSE), .I][
, list(ind = as.integer(V1), col = rep(V1[1], .N)), cumsum(is.na(as.integer(V1)))][
, dcast(na.omit(.SD), ind ~ col, value.var = "col", fill = "")]
# ind A B C
# 1: 1 A
# 2: 2 A
# 3: 3 A B
# 4: 4 B
# 5: 5 B C
# 6: 6 A
# 7: 7 A C
# 8: 8 C
# 9: 9 A
# 10: 10 A C
# 11: 11 C
# 12: 13 C
# 13: 14 C
Here's another alternative using cSplit
from my "splitstackshape" package. "x" is the same data read in using fread
.
library(splitstackshape)
cSplit(
cSplit(x[, toString(V1), cumsum(grepl("[A-Z]", V1))], "V1", "="), "V1_2", ",", "long")[
, dcast(.SD, V1_2 ~ V1_1, value.var = "V1_1", fill = "")]
Upvotes: 7
Reputation: 72848
A base R approach using readLines()
. We first store cell indexes starting with characters in a vector ch
, which helps to find sequences to paste
together into l2
. l2
as.numeric
already gives the indexes for the final data frame d
. We construct d
out of NA
s from the dimensions of l3
and fill it up according to the values of l3
.
l <- readLines("delim.txt")
ch <- c(grep("^\\D", l))
l2 <- apply(rbind(ch, c(ch[-1] - 1, length(l))), 2,
function(x) Reduce(paste, l[x[1]:x[2]]))
l3 <- lapply(sapply(strsplit(l2, "\\D"), as.numeric), na.omit)
d <- matrix(NA, max(sapply(l3, max)), length(l3))
let <- gsub("(.*)\\=.*", "\\1", l2) # the col-names
sapply(seq(length(l3)), function(n) d[l3[[n]], n] <<- let[n])
setNames(as.data.frame(d), let)
# A B C
# 1 A <NA> <NA>
# 2 A <NA> <NA>
# 3 A B <NA>
# 4 <NA> B <NA>
# 5 <NA> B C
# 6 A <NA> <NA>
# 7 A <NA> C
# 8 <NA> <NA> C
# 9 A <NA> <NA>
# 10 A <NA> C
# 11 <NA> <NA> C
# 12 <NA> <NA> <NA>
# 13 <NA> <NA> C
# 14 <NA> <NA> C
Upvotes: 1