Reputation: 317
I have a dataframe with around 1.5 million rows and 5 cols. One variable (VARIABLE) is of this type NATIONALITY_YEAR (e.g. SPAIN_1998) and I want to split it in two columns, one containing the Nationality, which is the left side of the name before the underscore, and one containing the Year, right side of the underscore. I have tried with concat.split which should be the easiest way:
aa <- concat.split(mydata, "VARIABLE", sep = "_", drop = F)
but after 2 hours running it did not produce any output. I am not sure if I should leave it running for a longer period of time or if there is a non time consuming way to do this.
Any help on the issue would be very much appreciated!
Here is a reproducible (subset!) sample:
mydata<- structure(list(PROVINCE = c(1L, 4L, 7L, 8L, 11L, 14L, 17L, 20L,
24L, 28L, 30L, 33L, 36L, 41L, 44L, 46L, 48L, 3L, 6L, 8L, 10L,
13L, 15L, 18L, 23L, 26L, 29L, 31L, 35L, 38L, 41L, 46L, 47L, 2L,
4L, 8L, 8L, 11L, 15L, 17L, 21L, 24L, 28L, 30L, 33L, 37L, 41L,
45L, 46L, 49L, 3L, 6L, 8L, 10L, 13L, 15L, 19L, 23L, 27L, 29L,
32L, 36L, 39L, 43L, 46L, 48L, 2L, 5L, 8L, 8L, 12L, 15L, 18L,
21L, 24L, 28L, 30L, 33L, 37L, 41L, 45L, 46L, 50L, 3L, 7L, 8L,
10L, 14L, 16L, 20L, 23L, 27L, 29L, 32L, 36L, 39L, 43L, 46L, 48L,
3L, 6L, 8L, 8L, 12L, 15L, 18L, 21L, 25L, 28L, 31L, 34L, 38L,
41L, 45L, 46L, 50L, 3L, 7L, 8L, 11L, 14L, 17L, 20L, 23L, 27L,
29L, 33L, 36L, 40L, 43L, 46L, 48L, 3L, 6L, 8L, 9L, 12L, 15L,
18L, 22L, 25L, 28L, 31L, 35L, 38L, 41L, 45L, 46L, 50L, 4L, 7L,
8L, 11L, 14L, 17L, 20L, 24L, 28L, 30L, 33L, 36L, 41L, 43L, 46L,
48L, 3L, 6L, 8L, 10L, 13L, 15L, 18L, 22L, 26L, 28L, 31L, 35L,
38L, 41L, 46L, 47L, 1L, 4L, 8L, 8L, 11L, 14L, 17L, 20L, 24L,
28L, 30L, 33L, 36L, 41L, 44L, 46L, 49L, 3L, 6L), AGE5 = structure(c(1L,
5L, 9L, 7L, 6L, 7L, 5L, 8L, 3L, 3L, 3L, 5L, 8L, 2L, 3L, 6L, 9L,
5L, 7L, 4L, 3L, 5L, 8L, 8L, 2L, 8L, 2L, 9L, 7L, 9L, 9L, 2L, 7L,
2L, 9L, 1L, 8L, 8L, 1L, 8L, 1L, 6L, 4L, 6L, 7L, 2L, 3L, 1L, 7L,
5L, 6L, 9L, 5L, 6L, 8L, 9L, 3L, 4L, 3L, 4L, 4L, 1L, 3L, 1L, 2L,
2L, 6L, 6L, 2L, 9L, 2L, 2L, 1L, 5L, 9L, 5L, 8L, 9L, 7L, 4L, 3L,
7L, 2L, 8L, 2L, 6L, 9L, 1L, 5L, 1L, 6L, 6L, 6L, 7L, 3L, 6L, 3L,
3L, 4L, 1L, 1L, 2L, 9L, 6L, 4L, 3L, 8L, 3L, 7L, 1L, 5L, 2L, 6L,
6L, 8L, 5L, 9L, 5L, 6L, 2L, 3L, 1L, 4L, 8L, 9L, 8L, 1L, 5L, 1L,
6L, 4L, 6L, 2L, 3L, 3L, 5L, 9L, 5L, 5L, 4L, 7L, 8L, 4L, 2L, 5L,
7L, 8L, 9L, 8L, 3L, 7L, 7L, 5L, 6L, 3L, 6L, 1L, 2L, 2L, 3L, 7L,
1L, 9L, 5L, 8L, 4L, 5L, 4L, 1L, 3L, 7L, 7L, 9L, 3L, 9L, 7L, 5L,
7L, 8L, 1L, 4L, 4L, 6L, 1L, 8L, 7L, 8L, 6L, 8L, 4L, 3L, 4L, 5L,
9L, 2L, 6L, 6L, 1L, 5L, 7L), .Label = c("10-14", "15-19", "20-24",
"25-29", "30-34", "35-39", "40-44", "45-49", "50-54"), class = "factor"),
ZONA91OK = c(101L, 4079L, 712L, 8205L, 11022L, 14021L, 1714L,
20067L, 2414L, 2810L, 300799L, 3305L, 36026L, 41024L, 4405L,
4607L, 48015L, 308L, 610L, 8121L, 1006L, 1307L, 1511L, 1813L,
2308L, 2605L, 2910L, 310799L, 35026L, 3811L, 411199L, 4601L,
4708L, 202L, 405L, 8015L, 837L, 11033L, 1502L, 1702L, 2112L,
2408L, 28047L, 30015L, 3305L, 3709L, 410199L, 4511L, 1202L,
490699L, 3063L, 610L, 827L, 1006L, 1301L, 15036L, 1901L,
2310L, 2709L, 29025L, 3201L, 36008L, 390899L, 4301L, 46184L,
4805L, 206L, 504L, 817L, 813L, 12135L, 1519L, 1810L, 2104L,
2402L, 28130L, 30030L, 3305L, 3707L, 411399L, 45165L, 46181L,
5008L, 305L, 7026L, 803L, 1006L, 1413L, 16078L, 200999L,
2312L, 2712L, 29069L, 3210L, 3616L, 391199L, 4313L, 46105L,
4805L, 310L, 6153L, 8252L, 8205L, 1205L, 1505L, 1808L, 2110L,
2508L, 2810L, 311399L, 3405L, 3807L, 41024L, 4507L, 46102L,
500599L, 3014L, 706L, 8121L, 11028L, 14042L, 1712L, 20045L,
2314L, 27031L, 29901L, 33024L, 3614L, 400199L, 4307L, 46021L,
4805L, 3066L, 6153L, 8015L, 901L, 12040L, 1522L, 1806L, 2203L,
2508L, 28047L, 311099L, 35004L, 3801L, 410199L, 4515L, 46017L,
501199L, 407L, 7027L, 827L, 1102L, 1404L, 17155L, 200599L,
24089L, 2812L, 30019L, 33024L, 3612L, 41038L, 4301L, 4628L,
4805L, 307L, 6153L, 817L, 1004L, 1309L, 1508L, 1804L, 2206L,
2606L, 28130L, 310799L, 35011L, 38022L, 411399L, 4622L, 4701L,
1036L, 4079L, 807L, 803L, 1108L, 1410L, 1708L, 201399L, 2410L,
28058L, 30043L, 33024L, 3610L, 410399L, 4401L, 4621L, 490499L,
3059L, 6153L), VARIABLE = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L), .Label = c("SPAIN_1998",
"EU15DC_1998", "ROE_1998", "MAGREB_1998", "SSA_1998", "LA_1998",
"ASIA_1998", "ROW_1998", "Total_1998", "SPAIN_1999", "EU15DC_1999",
"ROE_1999", "MAGREB_1999", "SSA_1999", "LA_1999", "ASIA_1999",
"ROW_1999", "Total_1999", "SPAIN_2000", "EU15DC_2000", "ROE_2000",
"MAGREB_2000", "SSA_2000", "LA_2000", "ASIA_2000", "ROW_2000",
"Total_2000", "SPAIN_2001", "EU15DC_2001", "ROE_2001", "MAGREB_2001",
"SSA_2001", "LA_2001", "ASIA_2001", "ROW_2001", "Total_2001",
"SPAIN_2002", "EU15DC_2002", "ROE_2002", "MAGREB_2002", "SSA_2002",
"LA_2002", "ASIA_2002", "ROW_2002", "Total_2002", "SPAIN_2003",
"EU15DC_2003", "ROE_2003", "MAGREB_2003", "SSA_2003", "LA_2003",
"ASIA_2003", "ROW_2003", "Total_2003", "SPAIN_2004", "EU15DC_2004",
"ROE_2004", "MAGREB_2004", "SSA_2004", "LA_2004", "ASIA_2004",
"ROW_2004", "Total_2004", "SPAIN_2005", "EU15DC_2005", "ROE_2005",
"MAGREB_2005", "SSA_2005", "LA_2005", "ASIA_2005", "ROW_2005",
"Total_2005", "SPAIN_2006", "EU15DC_2006", "ROE_2006", "MAGREB_2006",
"SSA_2006", "LA_2006", "ASIA_2006", "ROW_2006", "Total_2006",
"SPAIN_2007", "EU15DC_2007", "ROE_2007", "MAGREB_2007", "SSA_2007",
"LA_2007", "ASIA_2007", "ROW_2007", "Total_2007", "SPAIN_2008",
"EU15DC_2008", "ROE_2008", "MAGREB_2008", "SSA_2008", "LA_2008",
"ASIA_2008", "ROW_2008", "Total_2008", "SPAIN_2009", "EU15DC_2009",
"ROE_2009", "MAGREB_2009", "SSA_2009", "LA_2009", "ASIA_2009",
"ROW_2009", "Total_2009", "SPAIN_2010", "EU15DC_2010", "ROE_2010",
"MAGREB_2010", "SSA_2010", "LA_2010", "ASIA_2010", "ROW_2010",
"Total_2010", "SPAIN_2011", "EU15DC_2011", "ROE_2011", "MAGREB_2011",
"SSA_2011", "LA_2011", "ASIA_2011", "ROW_2011", "Total_2011",
"SPAIN_2012", "EU15DC_2012", "ROE_2012", "MAGREB_2012", "SSA_2012",
"LA_2012", "ASIA_2012", "ROW_2012", "Total_2012", "NOTSPAIN_1998",
"NOTSPAIN_1999", "NOTSPAIN_2000", "NOTSPAIN_2001", "NOTSPAIN_2002",
"NOTSPAIN_2003", "NOTSPAIN_2004", "NOTSPAIN_2005", "NOTSPAIN_2006",
"NOTSPAIN_2007", "NOTSPAIN_2008", "NOTSPAIN_2009", "NOTSPAIN_2010",
"NOTSPAIN_2011", "NOTSPAIN_2012", "AFRICA_1998", "AFRICA_1999",
"AFRICA_2000", "AFRICA_2001", "AFRICA_2002", "AFRICA_2003",
"AFRICA_2004", "AFRICA_2005", "AFRICA_2006", "AFRICA_2007",
"AFRICA_2008", "AFRICA_2009", "AFRICA_2010", "AFRICA_2011",
"AFRICA_2012", "DWC_1998", "DWC_1999", "DWC_2000", "DWC_2001",
"DWC_2002", "DWC_2003", "DWC_2004", "DWC_2005", "DWC_2006",
"DWC_2007", "DWC_2008", "DWC_2009", "DWC_2010", "DWC_2011",
"DWC_2012"), class = "factor"), FREQUENCY = c(614, 1943,
59, 201, 188, 10859, 93,
1494, 60, 1001, 1000, 689, 675, 934, 51,
1240, 165, 13, 0, 14, 2, 2,
2, 0, 3, 0, 40, 1, 18, 41, 1, 0, 3, 0, 0, 0, 1, 0,
0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0,
0, 0, 4, 0, 0, 15, 0, 0, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 11, 0, 0, 0, 3, 2, 1, 5,
64, 1, 4, 1, 3, 4, 8, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2173, 907, 9059, 839,
4303, 100, 1727, 663, 694, 1210, 623,
1261, 772, 697, 490, 1031, 490, 956, 704,
1293, 1011, 739, 927, 755, 3340, 1190, 1254, 12880, 528,
3244, 277, 892, 837, 1, 2, 10, 1, 1, 2, 2, 0, 0, 1, 8, 3,
12, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 12, 0, 7, 0, 0, 0,
0, 0, 5, 2)), .Names = c("PROVINCE", "AGE5", "ZONA91OK",
"VARIABLE", "FREQUENCY"), row.names = c(1L, 501L, 1001L, 1501L,
2001L, 2501L, 3001L, 3501L, 4001L, 4501L, 5001L, 5501L, 6001L,
6501L, 7001L, 7501L, 8001L, 8501L, 9001L, 9501L, 10001L, 10501L,
11001L, 11501L, 12001L, 12501L, 13001L, 13501L, 14001L, 14501L,
15001L, 15501L, 16001L, 16501L, 17001L, 17501L, 18001L, 18501L,
19001L, 19501L, 20001L, 20501L, 21001L, 21501L, 22001L, 22501L,
23001L, 23501L, 24001L, 24501L, 25001L, 25501L, 26001L, 26501L,
27001L, 27501L, 28001L, 28501L, 29001L, 29501L, 30001L, 30501L,
31001L, 31501L, 32001L, 32501L, 33001L, 33501L, 34001L, 34501L,
35001L, 35501L, 36001L, 36501L, 37001L, 37501L, 38001L, 38501L,
39001L, 39501L, 40001L, 40501L, 41001L, 41501L, 42001L, 42501L,
43001L, 43501L, 44001L, 44501L, 45001L, 45501L, 46001L, 46501L,
47001L, 47501L, 48001L, 48501L, 49001L, 49501L, 50001L, 50501L,
51001L, 51501L, 52001L, 52501L, 53001L, 53501L, 54001L, 54501L,
55001L, 55501L, 56001L, 56501L, 57001L, 57501L, 58001L, 58501L,
59001L, 59501L, 60001L, 60501L, 61001L, 61501L, 62001L, 62501L,
63001L, 63501L, 64001L, 64501L, 65001L, 65501L, 66001L, 66501L,
67001L, 67501L, 68001L, 68501L, 69001L, 69501L, 70001L, 70501L,
71001L, 71501L, 72001L, 72501L, 73001L, 73501L, 74001L, 74501L,
75001L, 75501L, 76001L, 76501L, 77001L, 77501L, 78001L, 78501L,
79001L, 79501L, 80001L, 80501L, 81001L, 81501L, 82001L, 82501L,
83001L, 83501L, 84001L, 84501L, 85001L, 85501L, 86001L, 86501L,
87001L, 87501L, 88001L, 88501L, 89001L, 89501L, 90001L, 90501L,
91001L, 91501L, 92001L, 92501L, 93001L, 93501L, 94001L, 94501L,
95001L, 95501L, 96001L, 96501L, 97001L, 97501L, 98001L, 98501L,
99001L, 99501L), class = "data.frame")
Upvotes: 3
Views: 792
Reputation: 193527
It seems like I need to look into updating my concat.split
functions!
The version of the function that you tried to use makes use of read.table
, which does tend to struggle with large datasets. I had used read.table
because it has a convenient text
argument that lets you specify a column in a data.frame
as the input. This is really convenient when working with small-ish datasets, but evidently not with larger ones :)
As far as I can tell, fread
from the "data.table" package doesn't have a similar feature, but since R tends to write files pretty quickly, I thought that it would be worth trying a similar approach as what I used in concat.split
with fread
instead of read.table
.
Here's the concept:
fread
to read it back in.fread
to get a text
argument somewhere down the line?Here's that concept as a function (updated with edits as per @eddi's suggestions in the comments):
csDataTable <- function(dataset, splitcol, sep, drop = FALSE) {
if (is.numeric(splitcol)) splitcol <- names(dataset)[splitcol]
if (!is.data.table(dataset)) dataset <- data.table(dataset)
if (sep == ".") {
dataset[, (splitcol) := gsub(".", "|", get(splitcol), fixed = TRUE)]
sep <- "|"
}
if (!is.character(dataset[[splitcol]])) {
dataset[, (splitcol) := as.character(get(splitcol))]
}
x <- tempfile()
writeLines(dataset[[splitcol]], x)
Split <- fread(x, sep=sep, header = FALSE)
setnames(Split, paste(splitcol, seq_along(Split), sep = "_"))
if (isTRUE(drop)) dataset[, (splitcol) := NULL]
cbind(dataset, Split)
}
Here's the function in action:
## Expand your sample data to 1.5 million rows to test
out <- mydata[rep(rownames(mydata), 1500000/nrow(mydata)), ]
csDataTable(out, "VARIABLE", "_")
# PROVINCE AGE5 ZONA91OK VARIABLE FREQUENCY VARIABLE_1 VARIABLE_2
# 1: 1 10-14 101 SPAIN_1998 614 SPAIN 1998
# 2: 4 30-34 4079 SPAIN_1998 1943 SPAIN 1998
# 3: 7 50-54 712 SPAIN_1998 59 SPAIN 1998
# 4: 8 40-44 8205 SPAIN_1998 201 SPAIN 1998
# 5: 11 35-39 11022 SPAIN_1998 188 SPAIN 1998
# ---
# 1499996: 44 35-39 4401 ROE_1999 0 ROE 1999
# 1499997: 46 35-39 4621 ROE_1999 0 ROE 1999
# 1499998: 49 10-14 490499 ROE_1999 0 ROE 1999
# 1499999: 3 30-34 3059 MAGREB_1999 5 MAGREB 1999
# 1500000: 6 40-44 6153 MAGREB_1999 2 MAGREB 1999
In this test, at least, the solution fares much better than I expected:
subFun <- function() {
dt = data.table(out)
dt[, `:=`(NATIONALITY = sub('(.*)_(.*)', '\\1', VARIABLE),
YEAR = sub('(.*)_(.*)', '\\2', VARIABLE))]
}
freadFun <- function() {
csDataTable(out, "VARIABLE", "_")
}
library(microbenchmark)
microbenchmark(subFun(), freadFun(), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# subFun() 3.814174 4.244820 4.273834 4.345358 4.480520 20
# freadFun() 1.356533 2.064262 2.152159 2.226465 2.300886 20
Upvotes: 6
Reputation: 7561
Here is some solution with splitting factor labels
VARIABLE_LEVELS <- cbind("VARIABLE"=levels(mydata$VARIABLE),
as.data.frame(do.call("rbind",
strsplit(levels(mydata$VARIABLE), split="_")))
mydata <- merge(mydata, VARIABLE_LEVELS)
#
# Insted of merege you can use VARIABLE (in mydata) as index
#
mydata <- cbind(mydata, VARIABLE_LEVELS[as.integer(mydata$VARIABLE),c("V1","V2")])
Upvotes: 3
Reputation: 49448
Try this instead:
library(data.table)
dt = data.table(mydata)
dt[, `:=`(NATIONALITY = sub('(.*)_(.*)', '\\1', VARIABLE),
YEAR = sub('(.*)_(.*)', '\\2', VARIABLE))]
Upvotes: 7