Reputation: 704
I have the following data in a text file
RecordID:Name1
VariableA:14
VariableB: 34
VariableC: 21
VariableD:red
VariableE: low
RecordID: Name2
VariableA: 31
VariableB: 21
VariableD:yellow
VariableE:high
and so on, several thousands records.
Each record starts then with a RecordID: and an identifier Occasionally some variable data is missing like in second record there is no Variable D. Also sometimes a variable cell can span accross two rows when a text string is split. Therefore I cannot rely on row numbers only on Variable names.
How to read such data into an R in a tidy form in columns:
RecordID | VariableA | VariableB | Variable C | Variable D| Variable E
----------------------------------------------------------------------
Name1 14 34 21 red low
Name2 31 21 yellow high
Upvotes: 3
Views: 1933
Reputation: 11965
This file is in DCF format so another option could be to read it directly using read.dcf
#read text file
txt <- readLines("test.txt")
#insert a blank line before each line having "RecordID" so that the subsequent line is identified as a new record
record_num <- grep("RecordID", txt)
for (i in seq(length(record_num))) {
txt <- append(txt, "", after = grep("RecordID", txt)[i] - 1)
}
#now read dcf file and convert it to a dataframe
df <- as.data.frame(read.dcf(textConnection(txt)))
Output is:
> df
RecordID VariableA VariableB VariableC VariableD VariableE
1 Name1 14 34 21 red low
2 Name2 31 21 <NA> yellow high
Sample data:
test.txt
contains
RecordID:Name1
VariableA:14
VariableB: 34
VariableC: 21
VariableD:red
VariableE: low
RecordID: Name2
VariableA: 31
VariableB: 21
VariableD:yellow
VariableE:high
Upvotes: 3
Reputation: 39154
A solution using tidyverse. Assuming that the data frame in the original format is dat
. dat2
is the final output. Notice that to create dat
by using the read.table
command, we can specify sep = :
to read in the data.
library(tidyverse)
dat2 <- dat %>%
mutate_all(funs(str_trim(.))) %>% # Trim the white space for all columns
mutate(RecordID = ifelse(V1 %in% "RecordID",
V2, NA)) %>% # Create a new column with the name from V2 when V1 is RecordID
fill(RecordID) %>% # Fill in NA in the RecordID column
filter(!V1 %in% "RecordID") %>% # Remove V1 == "RecordID"
spread(V1, V2, convert = TRUE) # Spread the data frame
dat2
# RecordID VariableA VariableB VariableC VariableD VariableE
# 1 Name1 14 34 21 red low
# 2 Name2 31 21 NA yellow high
DATA
dat <- read.table(text = "RecordID:Name1
VariableA:14
VariableB: 34
VariableC: 21
VariableD:red
VariableE: low
RecordID: Name2
VariableA: 31
VariableB: 21
VariableD:yellow
VariableE:high", sep = ":", stringsAsFactors = FALSE)
Upvotes: 1
Reputation: 6771
There are a few things you need to do. First is getting the text into a data frame. I used magrittr
pipe operators to make the process a little clearer, but it's essentially splitting by lines and then by colons.
library(magrittr)
my_d <- "RecordID:Name1
VariableA:14
VariableB: 34
VariableC: 21
VariableD:red
VariableE: low
RecordID: Name2
VariableA: 31
VariableB: 21
VariableD:yellow
VariableE:high" %>%
strsplit("\n") %>%
unlist() %>%
strsplit(., ":") %>%
data.frame() %>%
t() %>%
as.data.frame(stringsAsFactors=FALSE)
rownames(my_d) <- 1:dim(my_d)[1]
colnames(my_d) <- c("colname", "value")
After that we have a data frame my_d
with two columns. Then we need to add a column specifying for each VariableX
, which RecordID
it belongs to...
my_d$is_RecordID <- my_d$colname == "RecordID"
runs <- rle(my_d$is_RecordID)
new_lengths <- runs$lengths %>% matrix(ncol=2) %>% apply(., 2, "sum")
new_values <- my_d$value[my_d$is_RecordID]
runs$lengths <- new_lengths
runs$values <- new_values
my_d$RecordID <- inverse.rle(runs)
Then reshape the "long" data frame into "wide" format.
my_d_long <- my_d[!my_d$is_RecordID, c("RecordID", "colname", "value")]
my_d_wide <- reshape(my_d_long, idvar = "RecordID", timevar = "colname", direction = "wide")
Upvotes: 1
Reputation: 70653
This is how I would do it.
library(tidyr)
xy <- readLines(con = "test.txt") # this is your data file, read it line-wise
out <- data.frame(temp = xy)
find.record <- grepl("RecordID:", out$temp) # identify where record starts
# create RecordID column
out$RecordID <- NA
out[find.record, "RecordID"] <- as.character(out[find.record, "temp"])
out <- fill(out, "RecordID")
out$RecordID <- trimws(sapply(strsplit(out$RecordID, ":"), "[[", 2))
# now that we have a RecordID column, remove these lines
out <- out[!find.record, ]
# split the data on colon and put into respective columns
out$variable <- trimws(sapply(strsplit(as.character(out$temp), ":"), "[[", 1))
out$value <- trimws(sapply(strsplit(as.character(out$temp), ":"), "[[", 2))
out$temp <- NULL # remove temporary data
out
RecordID variable value
2 Name1 VariableA 14
3 Name1 VariableB 34
4 Name1 VariableC 21
5 Name1 VariableD red
6 Name1 VariableE low
8 Name2 VariableA 31
9 Name2 VariableB 21
10 Name2 VariableD yellow
11 Name2 VariableE high
# transform from long to wide format
spread(out, key = variable, value = value)
RecordID VariableA VariableB VariableC VariableD VariableE
1 Name1 14 34 21 red low
2 Name2 31 21 <NA> yellow high
Upvotes: 1