Reputation: 2067
# Sample series: 24 numeric observations.
data<-c(10.0,11.1,12.3,13.2,14.8,15.6,16.7,17.5,18.9,19.7,20.7,21.1,22.6,23.5,24.9,25.1,26.3,27.8,28.8,29.6,30.2,31.6,32.1,33.7)
startDate <- '2013-01-01'
endDate <- '2013-01-01'
# cbind() recycles the two single date strings down every row, so `df` wraps
# a 24-row, 3-column matrix (data, startDate, endDate).
df <- ts(cbind(data, startDate, endDate))
df
################
# Intended split: first 80% of the observations for training, the rest for
# testing.
smp_size <- 0.80
# NOTE: length(df) counts every cell of the matrix (24 * 3 = 72), not the
# 24 rows, so train_ind is 57.6 rather than roughly 19.
train_ind <- length(df) * smp_size
train_split <- seq(from = 1, to = train_ind)
test_split <- seq(from = train_ind +1, to = length(df))
# `data` has only 24 elements; every requested position past 24 yields NA.
train <- data[train_split]
# NOTE: a negative index removes the listed positions instead of selecting
# them.
test <- data[-test_split]
(c(train, test))
I have the above data and I am trying to split it into time series splits, i.e., the first 80% as training and the remaining 20% as testing.
I keep getting weird results:
(c(train, test))
[1] 10.0 11.1 12.3 13.2 14.8 15.6 16.7 17.5 18.9 19.7 20.7 21.1 22.6 23.5 24.9 25.1 26.3 27.8 28.8 29.6 30.2
[22] 31.6 32.1 33.7 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[43] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 10.0 11.1 12.3 13.2 14.8 15.6
[64] 16.7 17.5 18.9 19.7 20.7 21.1 22.6 23.5 24.9 25.1 26.3 27.8 28.8 29.6 30.2 31.6 32.1 33.7
Why are there NA values in the middle of the data?
Upvotes: 0
Views: 34
Reputation: 389047
Calculate the number of rows to include in the training set and use the window
function to subset the time series
# Number of leading observations that go to the training set: 80% of the
# rows, rounded up.
train_size <- ceiling(0.8 * nrow(df))

# window() takes its bounds in the series' time units. `df` was built by ts()
# with the default start = 1 and frequency = 1, so time t is simply row t.
# A NULL bound means "leave that side open".
train_set <- stats::window(df, start = NULL, end = train_size)
test_set <- stats::window(df, start = train_size + 1, end = NULL)
train_set
#Time Series:
#Start = 1
#End = 20
#Frequency = 1
# data startDate endDate
# 1 10 2013-01-01 2013-01-01
# 2 11.1 2013-01-01 2013-01-01
# 3 12.3 2013-01-01 2013-01-01
# 4 13.2 2013-01-01 2013-01-01
# 5 14.8 2013-01-01 2013-01-01
# 6 15.6 2013-01-01 2013-01-01
# 7 16.7 2013-01-01 2013-01-01
# 8 17.5 2013-01-01 2013-01-01
# 9 18.9 2013-01-01 2013-01-01
#10 19.7 2013-01-01 2013-01-01
#11 20.7 2013-01-01 2013-01-01
#12 21.1 2013-01-01 2013-01-01
#13 22.6 2013-01-01 2013-01-01
#14 23.5 2013-01-01 2013-01-01
#15 24.9 2013-01-01 2013-01-01
#16 25.1 2013-01-01 2013-01-01
#17 26.3 2013-01-01 2013-01-01
#18 27.8 2013-01-01 2013-01-01
#19 28.8 2013-01-01 2013-01-01
#20 29.6 2013-01-01 2013-01-01
test_set
#Time Series:
#Start = 21
#End = 24
#Frequency = 1
# data startDate endDate
#21 30.2 2013-01-01 2013-01-01
#22 31.6 2013-01-01 2013-01-01
#23 32.1 2013-01-01 2013-01-01
#24 33.7 2013-01-01 2013-01-01
Upvotes: 1
Reputation: 4989
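You should use nrow(df), not length(df), for time-series objects.
length(df) counts every cell of the underlying matrix (72 here), whereas nrow(df) counts the observations (24).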
# Sample series: 24 numeric observations.
data <- c(
  10.0, 11.1, 12.3, 13.2, 14.8, 15.6, 16.7, 17.5, 18.9,
  19.7, 20.7, 21.1, 22.6, 23.5, 24.9, 25.1, 26.3, 27.8,
  28.8, 29.6, 30.2, 31.6, 32.1, 33.7
)
startDate <- "2013-01-01"
endDate <- "2013-01-01"

# cbind() recycles the two date strings down every row, so `df` wraps a
# 24-row, 3-column matrix; nrow(df) is the number of observations.
df <- ts(cbind(data, startDate, endDate))

# Size of the training split: 80% of the rows, rounded down explicitly
# instead of relying on `:` to truncate a fractional upper bound.
n_train <- floor(nrow(df) * 0.8)

# One logical mask drives both subsets, so they cannot drift apart. Unlike
# 1:n / -(1:n), it also behaves when n_train is 0.
is_train <- seq_len(nrow(df)) <= n_train

# drop = FALSE keeps each piece a matrix even if it holds a single row.
train <- df[is_train, , drop = FALSE]
test <- df[!is_train, , drop = FALSE]
> all.equal(df, ts(rbind(train, test)))
[1] TRUE
> length(df)
[1] 72
> nrow(df)
[1] 24
Upvotes: 2