user8959427
user8959427

Reputation: 2067

Where am I going wrong in splitting a time series?

# The 24 observations of the series.
data<-c(10.0,11.1,12.3,13.2,14.8,15.6,16.7,17.5,18.9,19.7,20.7,21.1,22.6,23.5,24.9,25.1,26.3,27.8,28.8,29.6,30.2,31.6,32.1,33.7)
# Both dates are single strings with the same value; cbind() below recycles
# them onto every row.
startDate <- '2013-01-01'
endDate <- '2013-01-01'


# Bind the values and the two date strings column-wise (24 rows x 3 columns)
# and wrap the result as a time series.
df <- ts(cbind(data, startDate, endDate))
df


################
# Intended split: first 80% for training, remaining 20% for testing.

smp_size <- 0.80
# NOTE(review): length(df) counts every cell of the 24 x 3 matrix (72), not
# the 24 observations, so train_ind is 57.6 here.
train_ind <- length(df) * smp_size

# Positions 1..57 (seq() stops at the last whole step below 57.6).
train_split <- seq(from = 1, to = train_ind)
# Positions 58.6, 59.6, ... up to 72 -- fractional and all beyond the 24 values.
test_split <- seq(from = train_ind +1, to = length(df))

# Indexing the 24-element vector with positions up to 57 pads the result with
# NA for every position past 24.
train <- data[train_split]
# NOTE(review): a negative index removes positions instead of selecting them;
# every position here is past the end of `data`, so nothing is removed and
# `test` is the whole vector.
test <- data[-test_split]

# Show both pieces back to back.
(c(train, test))

I have the above data and I am trying to split it chronologically, i.e. the first 80% as training data and the remaining 20% as testing data.

I keep getting weird results:

(c(train, test))
 [1] 10.0 11.1 12.3 13.2 14.8 15.6 16.7 17.5 18.9 19.7 20.7 21.1 22.6 23.5 24.9 25.1 26.3 27.8 28.8 29.6 30.2
[22] 31.6 32.1 33.7   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
[43]   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 10.0 11.1 12.3 13.2 14.8 15.6
[64] 16.7 17.5 18.9 19.7 20.7 21.1 22.6 23.5 24.9 25.1 26.3 27.8 28.8 29.6 30.2 31.6 32.1 33.7

Why are there NA values in the middle of the data?

Upvotes: 0

Views: 34

Answers (2)

Ronak Shah
Ronak Shah

Reputation: 389047

Calculate the number of rows to include in the training set and use the window() function to subset the time series; the remaining rows form the test set.

# Split `df` into a leading 80% training window and a trailing test window,
# keeping both pieces as time-series objects.

# Number of observations in the training window, rounded up.
train_size <- ceiling(nrow(df) * 0.8)
# window() interprets `start`/`end` in the series' own time units, not as row
# positions, so translate the row cut-off into time stamps first. With the
# default index (start = 1, frequency = 1) the time stamps equal the row
# numbers, so the result is unchanged for that case.
obs_times <- time(df)
train_set <- window(df, end = obs_times[train_size])
test_set <- window(df, start = obs_times[train_size + 1])
train_set

#Time Series:
#Start = 1 
#End = 20 
#Frequency = 1 
#   data  startDate    endDate
# 1   10 2013-01-01 2013-01-01
# 2 11.1 2013-01-01 2013-01-01
# 3 12.3 2013-01-01 2013-01-01
# 4 13.2 2013-01-01 2013-01-01
# 5 14.8 2013-01-01 2013-01-01
# 6 15.6 2013-01-01 2013-01-01
# 7 16.7 2013-01-01 2013-01-01
# 8 17.5 2013-01-01 2013-01-01
# 9 18.9 2013-01-01 2013-01-01
#10 19.7 2013-01-01 2013-01-01
#11 20.7 2013-01-01 2013-01-01
#12 21.1 2013-01-01 2013-01-01
#13 22.6 2013-01-01 2013-01-01
#14 23.5 2013-01-01 2013-01-01
#15 24.9 2013-01-01 2013-01-01
#16 25.1 2013-01-01 2013-01-01
#17 26.3 2013-01-01 2013-01-01
#18 27.8 2013-01-01 2013-01-01
#19 28.8 2013-01-01 2013-01-01
#20 29.6 2013-01-01 2013-01-01

test_set
#Time Series:
#Start = 21 
#End = 24 
#Frequency = 1 
#   data  startDate    endDate
#21 30.2 2013-01-01 2013-01-01
#22 31.6 2013-01-01 2013-01-01
#23 32.1 2013-01-01 2013-01-01
#24 33.7 2013-01-01 2013-01-01

Upvotes: 1

Roman
Roman

Reputation: 4989

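You should use nrow(df), not length(df), for a multi-column time-series object: df is a 24 x 3 matrix, so length(df) counts all 72 cells, while nrow(df) gives the 24 observations.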

# The 24 observations of the series.
data <- c(10.0, 11.1, 12.3, 13.2, 14.8, 15.6, 16.7, 17.5, 18.9,
          19.7, 20.7, 21.1, 22.6, 23.5, 24.9, 25.1, 26.3, 27.8,
          28.8, 29.6, 30.2, 31.6, 32.1, 33.7)
# Single date strings; cbind() recycles them onto every row. The variable
# names become the column names of the matrix, so they are kept as-is.
startDate <- "2013-01-01"
endDate <- "2013-01-01"

# 24 x 3 time-series matrix (columns: data, startDate, endDate).
df <- ts(cbind(data, startDate, endDate))

# Size of the training part: 80% of the rows, rounded down explicitly instead
# of relying on `:` silently truncating a fractional endpoint.
n_train <- floor(nrow(df) * 0.8)

# A logical row mask avoids both the `1:n` pitfall when n is 0 and the
# `-integer(0)` pitfall of negative indexing, and guarantees that the two
# parts are exact complements of each other.
is_train <- seq_len(nrow(df)) <= n_train

# drop = FALSE keeps each part a matrix even if it ends up with a single row.
train <- df[is_train, , drop = FALSE]
test <- df[!is_train, , drop = FALSE]

> all.equal(df, ts(rbind(train, test)))
[1] TRUE
> length(df) 
[1] 72
> nrow(df)
[1] 24

Upvotes: 2

Related Questions