Reputation: 59
I have a dataframe with the following format:
6/6/2019
No. sex ID weight(g)
1 f 0.001%Q 207
2 f 0.001%Q 198
3 f 0.001%Q 209
4 f 0.01%S 215
5 f 0.01%S 228
6 f 0.01%S 219
7 m 0.001%Q 303
8 m 0.001%Q 301
9 m 0.001%Q 319
10 m 0.01%S 303
11 m 0.01%S 305
12 m 0.01%S 295
6/7/2019
No. sex ID weight(g)
1 f 0.0025%Q 212
2 f 0.0025%Q 204
3 f 0.0025%Q 218
4 f 0.04%S 219
5 f 0.04%S 232
6 f 0.04%S 227
7 m 0.0025%Q 316
8 m 0.0025%Q 307
9 m 0.0025%Q 322
10 m 0.04%S 311
11 m 0.04%S 316
12 m 0.04%S 301
I would like to combine the two datasets, but make an additional column for the date of the dataset (to differentiate the two). Is there a way to remove the row containing the date and add it as a repeated value for an additional column the length of the dataset?
Upvotes: 0
Views: 74
Reputation: 21400
If your two dataframes look like these:
df1 <- data.frame(No. = 1:12,
sex = c(rep("f", 6), rep("m", 6)),
ID = c("0.0025%Q", "0.0025%Q", "0.0025%Q", "0.04%S",
"0.04%S", "0.04%S", "0.0025%Q", "0.0025%Q",
"0.0025%Q", "0.04%S", "0.04%S", "0.04%S"),
weight = c("207", "198", "209", "215", "228", "219",
"303", "301", "319", "303", "305", "295"))
df2 <- data.frame(No. = 1:12,
sex = c(rep("f", 6), rep("m", 6)),
ID = c("0.001%Q", "0.001%Q", "0.001%Q", "0.01%S",
"0.01%S", "0.01%S", "0.001%Q", "0.001%Q", "0.001%Q", "0.01%S",
"0.01%S", "0.01%S"),
weight = c("212", "204", "218", "219", "232", "227",
"316", "307", "322", "311", "316", "301"))
then the solution is straightforward: you take the date for each and replicate it the number of rows in each dataframe:
df1$date <- rep("6/6/2019", nrow(df1))
df2$date <- rep("6/7/2019", nrow(df2))
and row-bind the two data frames using rbind
:
df_combined <- rbind(df1, df2)
to obtain the combined dataframe:
df_combined
No. sex ID weight date
1 1 f 0.0025%Q 207 6/6/2019
2 2 f 0.0025%Q 198 6/6/2019
3 3 f 0.0025%Q 209 6/6/2019
4 4 f 0.04%S 215 6/6/2019
5 5 f 0.04%S 228 6/6/2019
6 6 f 0.04%S 219 6/6/2019
7 7 m 0.0025%Q 303 6/6/2019
8 8 m 0.0025%Q 301 6/6/2019
9 9 m 0.0025%Q 319 6/6/2019
10 10 m 0.04%S 303 6/6/2019
11 11 m 0.04%S 305 6/6/2019
12 12 m 0.04%S 295 6/6/2019
13 1 f 0.001%Q 212 6/7/2019
14 2 f 0.001%Q 204 6/7/2019
15 3 f 0.001%Q 218 6/7/2019
16 4 f 0.01%S 219 6/7/2019
17 5 f 0.01%S 232 6/7/2019
18 6 f 0.01%S 227 6/7/2019
19 7 m 0.001%Q 316 6/7/2019
20 8 m 0.001%Q 307 6/7/2019
21 9 m 0.001%Q 322 6/7/2019
22 10 m 0.01%S 311 6/7/2019
23 11 m 0.01%S 316 6/7/2019
24 12 m 0.01%S 301 6/7/2019
EDIT:
Since you do not have several but only a single dataframe, here's another solution. The first critical step is to read-in the file properly, meaning, in your case, to strip the white space behind the variable names; this is achieved by the argument strip.white = T
:
df <- read.table("Your path/Your dataframe.txt", header=F, strip.white = T, sep="\t", quote="", comment.char="")
The next step, equally critical, is to convert all the data to character:
df[,1:4] <- lapply(df[1:4], as.character)
Set all empty fields to "NA":
df[df==""] <- "NA"
Re-name the column names with the right values:
colnames(df) <- df[2,]
Grab the dates that you want to insert into a new column:
dates <- df[c(1,16),1]
Delete rows that have now become unnecessary:
df <- df[-c(1:2,15:17),]
And, finally, create the desired new column with the repeated dates:
df$date <- c(rep(dates[1], 12), rep(dates[2], 12))
The final result is this:
df
No. sex ID weight(g) date
3 1 f 0.001%Q 207 6/6/19
4 2 f 0.001%Q 198 6/6/19
5 3 f 0.001%Q 209 6/6/19
6 4 f 0.01%S 215 6/6/19
7 5 f 0.01%S 228 6/6/19
8 6 f 0.01%S 219 6/6/19
9 7 m 0.001%Q 303 6/6/19
10 8 m 0.001%Q 301 6/6/19
11 9 m 0.001%Q 319 6/6/19
12 10 m 0.01%S 303 6/6/19
13 11 m 0.01%S 305 6/6/19
14 12 m 0.01%S 295 6/6/19
18 1 f 0.0025%Q 212 6/7/19
19 2 f 0.0025%Q 204 6/7/19
20 3 f 0.0025%Q 218 6/7/19
21 4 f 0.04%S 219 6/7/19
22 5 f 0.04%S 232 6/7/19
23 6 f 0.04%S 227 6/7/19
24 7 m 0.0025%Q 316 6/7/19
25 8 m 0.0025%Q 307 6/7/19
26 9 m 0.0025%Q 322 6/7/19
27 10 m 0.04%S 311 6/7/19
28 11 m 0.04%S 316 6/7/19
29 12 m 0.04%S 301 6/7/19
Depending on how many internal divisions you have in your real dataframe, some of these steps might need some further refinement and automation.
Upvotes: 1
Reputation: 388862
Assuming you have the data as shown in the dput
below we can get the data in a list and for each dataframe do :
1) Give new names to the dataframe from the 2nd row
2) Add new column using the information from the 1st row
3) Remove first 2 rows
We then bind these datasets together.
list_df <- list(df1, df2)
do.call(rbind, c(lapply(list_df, function(df) {
names(df) <- df[2, ]
df$new_col <- na.omit(unlist(df[1, ]))
df <- df[-c(1, 2), ]
df
}), make.row.names = FALSE))
# No. sex ID weight(g) new_col
#1 1 f 0.001%Q 207 6/6/2019
#2 2 f 0.001%Q 198 6/6/2019
#3 3 f 0.001%Q 209 6/6/2019
#4 4 f 0.01%S 215 6/6/2019
#5 5 f 0.01%S 228 6/6/2019
#6 6 f 0.01%S 219 6/6/2019
#7 7 m 0.001%Q 303 6/6/2019
#8 8 m 0.001%Q 301 6/6/2019
#9 9 m 0.001%Q 319 6/6/2019
#10 10 m 0.01%S 303 6/6/2019
#11 11 m 0.01%S 305 6/6/2019
#12 12 m 0.01%S 295 6/6/2019
#13 1 f 0.0025%Q 212 6/7/2019
#14 2 f 0.0025%Q 204 6/7/2019
#15 3 f 0.0025%Q 218 6/7/2019
#16 4 f 0.04%S 219 6/7/2019
#17 5 f 0.04%S 232 6/7/2019
#18 6 f 0.04%S 227 6/7/2019
#19 7 m 0.0025%Q 316 6/7/2019
#20 8 m 0.0025%Q 307 6/7/2019
#21 9 m 0.0025%Q 322 6/7/2019
#22 10 m 0.04%S 311 6/7/2019
#23 11 m 0.04%S 316 6/7/2019
#24 12 m 0.04%S 301 6/7/2019
data
df1 <- structure(list(V1 = c("6/6/2019", "No.", "1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12"), V2 = c(NA, "sex",
"f", "f", "f", "f", "f", "f", "m", "m", "m", "m", "m", "m"),
V3 = c(NA, "ID", "0.001%Q", "0.001%Q", "0.001%Q", "0.01%S",
"0.01%S", "0.01%S", "0.001%Q", "0.001%Q", "0.001%Q", "0.01%S",
"0.01%S", "0.01%S"), V4 = c(NA, "weight(g)", "207", "198",
"209", "215", "228", "219", "303", "301", "319", "303", "305",
"295")), row.names = c(NA, -14L), class = "data.frame")
df2 <- structure(list(V1 = c("6/7/2019", "No.", "1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12"), V2 = c(NA, "sex",
"f", "f", "f", "f", "f", "f", "m", "m", "m", "m", "m", "m"),
V3 = c(NA, "ID", "0.0025%Q", "0.0025%Q", "0.0025%Q", "0.04%S",
"0.04%S", "0.04%S", "0.0025%Q", "0.0025%Q", "0.0025%Q", "0.04%S",
"0.04%S", "0.04%S"), V4 = c(NA, "weight(g)", "212", "204",
"218", "219", "232", "227", "316", "307", "322", "311", "316",
"301")), row.names = c(NA, -14L), class = "data.frame")
Upvotes: 2