Reputation: 196
I have a big data CSV file. I managed to merge them into one CSV file. Now when I read the file its is not consistent inflow. For example :
file1 CSV: id date
21 2001
21 2002
21 2003
22 2001
22 2001
22 2002
file2 CSV: 21 2006
21 2005
21 2007
22 2006
22 2006
and so on. Now when I merge all files in one CSV i want output like
id date
21 2001
21 2002
21 2003
21 2006
21 2005
21 2007
22 2001
22 2001
22 2002
22 2006
22 2006
Code:
All <- lapply(filenames_list,function(filename){
print(paste("Merging",filename,sep = " "))
read.csv(filename)
})
df <- data.frame(do.call(rbind.data.frame, All))
write.csv(df,merge_file_name)
this is the code I did to merge all files in one CSV. Kindly help me to keep them in an organized pattern.
dput(head(newdata1,10))
structure(list(X = 1:10, gvkey = c(7041L, 7041L, 7041L, 7041L,
7041L, 7041L, 7041L, 7041L, 7041L, 7041L), datadate = c(20000103L,
20000104L, 20000105L, 20000106L, 20000111L, 20000112L, 20000113L,
20000114L, 20000117L, 20000118L), cshoc = c(4.2e+07, 4.2e+07,
4.2e+07, 4.2e+07, 4.2e+07, 4.2e+07, 4.2e+07, 4.2e+07, 4.2e+07,
4.2e+07), cshtrd = c(112000, 637000, 241000, 251000, 224000,
194000, 175000, 217000, 307000, 326000), prccd = c(3.86, 4.28,
4, 4.04, 4, 3.96, 3.92, 3.96, 4.06, 4.14), prchd = c(3.86, 4.6,
4.22, 4.26, 4.26, 4.04, 4.1, 4.02, 4.2, 4.3), prcld = c(3.3,
3.86, 3.9, 3.96, 4, 3.9, 3.92, 3.88, 4, 4.02), prcstd = c(10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L), qunit = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), cheqv = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), cheqvgross = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
div = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), divd = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), divdgross = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), divdnet = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), divdtm = c("", "", "", "", "", "", "", "", "",
""), divgross = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), divnet = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), divrc = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), divrcgross = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), divrcnet = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), divsp = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), divspgross = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), divspnet = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), divsptm = c("",
"", "", "", "", "", "", "", "", ""), anncdate = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), cheqvpaydate = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), divdpaydate = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), divrcpaydate = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), divsppaydate = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), paydate = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), recorddate = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), split = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), splitf = c("", "", "", "",
"", "", "", "", "", ""), trfd = c(1.05474854, 1.05474854,
1.05474854, 1.05474854, 1.05474854, 1.05474854, 1.05474854,
1.05474854, 1.05474854, 1.05474854), monthend = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), fyrc = c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), ggroup = c(2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L), gind = c(201070L,
201070L, 201070L, 201070L, 201070L, 201070L, 201070L, 201070L,
201070L, 201070L), gsector = c(20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L), gsubind = c(20107010L, 20107010L, 20107010L,
20107010L, 20107010L, 20107010L, 20107010L, 20107010L, 20107010L,
20107010L), naics = c(999990L, 999990L, 999990L, 999990L,
999990L, 999990L, 999990L, 999990L, 999990L, 999990L), sic = c(9995L,
9995L, 9995L, 9995L, 9995L, 9995L, 9995L, 9995L, 9995L, 9995L
), spcindcd = c(400L, 400L, 400L, 400L, 400L, 400L, 400L,
400L, 400L, 400L), spcseccd = c(970L, 970L, 970L, 970L, 970L,
970L, 970L, 970L, 970L, 970L), ipodate = c(19920103L, 19920103L,
19920103L, 19920103L, 19920103L, 19920103L, 19920103L, 19920103L,
19920103L, 19920103L)), row.names = c(NA, 10L), class = "data.frame")
Upvotes: 0
Views: 99
Reputation: 76
If I understand your ideal outcome correctly, you want to order the data.frame df by the column ID and then save this as a csv file. If so, in your code you can add a line using order() and sort by id in ascending order and then save that data frame. By putting order inside the square brackets [] it retains the rest of your data. By putting order before the comma , inside the brackets, the data becomes ordered by the row values.
Instead of:
df <- data.frame(do.call(rbind.data.frame, All))
write.csv(df,merge_file_name)
Add the line with order:
df <- data.frame(do.call(rbind.data.frame, All))
new_df <- df[order(df$id),]
write.csv(new_df,merge_file_name)
For a full example:
# Create two data frames
df1 <- data.frame(gvkey = c(1:6),
dat = runif(6))
df2 <- data.frame(gvkey = c(1:6),
dat = runif(6))
# bind them by the rows
df <- rbind(df1, df2)
# order by gvkey
new_df <- df[order(df$gvkey),]
Upvotes: 1