HerrSubset
HerrSubset

Reputation: 31

R function keeps returning empty data frame

I'm trying to write an R function that loops through a given dataframe to filter it a bit. The data in the dataframe consists of travel information between two lines in the London subway an I'd like to cut off the top percent. Here's the output of the str() function for the input data:

'data.frame':   71748 obs. of  9 variables:
$ depart       : Factor w/ 52 levels "Bank","Barkingside",..: 22 22 22 22 22 25 25 25 25 25 ...
$ arrival      : Factor w/ 48 levels "Bank","Barkingside",..: 48 43 38 5 8 1 42 48 41 43 ...
$ traveltime   : num  433 1102 161 584 891 ...
$ departuretime: POSIXlt, format: "2014-03-24 18:17:20" "2014-03-24 18:17:20" "2014-03-24 18:17:20" ...
$ arrivaltime  : POSIXlt, format: "2014-03-24 18:24:33" "2014-03-24 18:35:42" "2014-03-24 18:20:01" ...
$ lcid         : Factor w/ 28 levels "1000001","1000002",..: 1 1 1 1 1 1 1 1 1 1 ...
$ tripno       : Factor w/ 25 levels "1","10","11",..: 2 2 2 2 2 2 2 2 2 2 ...
$ destination  : Factor w/ 18 levels "Debden","Ealing Broadway",..: 3 3 3 3 3 3 3 3 3 3 ...
$ line         : Factor w/ 1 level "C": 1 1 1 1 1 1 1 1 1 1 ...

Here's the functions I wrote:

#cut off top percent of travel times for each combination of arrival and
#departure stations to remove outliers
cutOffTopPercent <- function(data, percentage=0.99){

  res <- data.frame()

  #loop through all combinations of depart and arrival stations
  for(i in 1:length(levels(data$depart))){
    for(j in 1:length(levels(data$arrival))){

      #create variables for departure/arrival station to make code easier to read
      departureStation <- levels(data$depart)[i]
      arrivalStation <- levels(data$arrival)[j]

      #create a subset containing only the current departure and arrival station
      dataSubset <- data[data$depart == departureStation & data$arrival == arrivalStation,]

      #get top value that's allowed
      upperBorder <- getTopPercentileBottom(dataSubset, percentage)
      #remove records with values higher than than allowed
      dataSubset <- dataSubset[dataSubset$traveltime < upperBorder,]

      #glue the subset to the end result
      res <- rbind(res,dataSubset)
      }
  }

  return(res)
}

#returns the traveltime that marks where the given percentage of traveltimes starts
getTopPercentileBottom <- function(data, percentile){
  upperBorder <- quantile(data$traveltime, probs = percentile)
  return(upperBorder)
}

The cutOffTopPercent() function always returns an empty data frame however. I can't find my error. I've been trying to go to the steps manually, but when I do so, all the data subsets get appended to the res dataframe correctly.

Can anyone see what I did wrong, or suggest a better approach to what I'm trying to do?

EDIT:

a dput of the first 30 records in my input data:

structure(list(depart = structure(c(22L, 22L, 22L, 22L, 22L, 
25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 
25L, 25L, 25L, 25L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L), .Label = c("Bank", 
"Barkingside", "Bethnal Green", "Bond Street", "Buckhurst Hill", 
"Chancery Lane", "Chigwell", "Debden", "Ealing Broadway", "East Acton", 
"Epping", "Fairlop", "Gants Hill", "Grange Hill", "Greenford", 
"Hainault", "Hanger Lane", "Holborn", "Holland Park", "Lancaster Gate", 
"Leyton", "Leytonstone", "Liverpool Street", "Loughton", "Marble Arch", 
"Mile End", "Newbury Park", "Newbury Park Loop", "North Acton", 
"North Acton Junction", "Northolt", "Notting Hill Gate", "Oxford Circus", 
"Perivale", "Queensway", "Redbridge", "Roding Valley", "Ruislip Gardens", 
"Shepherd's Bush", "Shepherds Bush (Central Line)", "Snaresbrook", 
"South Ruislip", "South Woodford", "St. Paul's", "Stratford", 
"Theydon Bois", "Tottenham Court Road", "Wanstead", "West Acton", 
"West Ruislip", "White City", "Woodford"), class = "factor"), 
    arrival = structure(c(48L, 43L, 38L, 5L, 8L, 1L, 42L, 48L, 
    41L, 43L, 6L, 38L, 5L, 4L, 16L, 30L, 44L, 20L, 8L, 3L, 24L, 
    19L, 1L, 42L, 48L, 41L, 43L, 6L, 38L, 5L), .Label = c("Bank", 
    "Barkingside", "Bethnal Green", "Bond Street", "Buckhurst Hill", 
    "Chancery Lane", "Chigwell", "Debden", "East Acton", "Fairlop", 
    "Gants Hill", "Grange Hill", "Greenford", "Hainault", "Hanger Lane", 
    "Holborn", "Holland Park", "Lancaster Gate", "Leyton", "Leytonstone", 
    "Liverpool Street", "Loughton", "Marble Arch", "Mile End", 
    "Newbury Park", "North Acton", "North Acton Junction", "Northolt", 
    "Notting Hill Gate", "Oxford Circus", "Perivale", "Queensway", 
    "Redbridge", "Roding Valley", "Ruislip Gardens", "Shepherd's Bush", 
    "Shepherds Bush (Central Line)", "Snaresbrook", "South Ruislip", 
    "South Woodford", "St. Paul's", "Stratford", "Theydon Bois", 
    "Tottenham Court Road", "Wanstead", "West Acton", "White City", 
    "Woodford"), class = "factor"), traveltime = c(433, 1102, 
    161, 584, 891, 829, 1473, 2273, 629, 2942, 467, 2001, 2424, 
    75, 351, 165, 249, 1840, 2731, 1148, 1289, 1653, 580, 1224, 
    2024, 380, 2693, 218, 1752, 2175), departuretime = structure(list(
        sec = c(20, 20, 20, 20, 20, 40, 40, 40, 40, 40, 40, 40, 
        40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 49, 49, 49, 49, 
        49, 49, 49, 49), min = c(17L, 17L, 17L, 17L, 17L, 46L, 
        46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 
        46L, 46L, 46L, 46L, 46L, 50L, 50L, 50L, 50L, 50L, 50L, 
        50L, 50L), hour = c(18L, 18L, 18L, 18L, 18L, 17L, 17L, 
        17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 
        17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 
        17L), mday = c(24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 
        24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 
        24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L
        ), mon = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
        2L, 2L, 2L, 2L, 2L), year = c(114L, 114L, 114L, 114L, 
        114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 
        114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 
        114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L), wday = c(1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L), yday = c(82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 
        82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 
        82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L
        ), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
        0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
        0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("sec", "min", "hour", 
    "mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt", 
    "POSIXt"), tzone = "GMT"), arrivaltime = structure(list(sec = c(33, 
    42, 1, 4, 11, 29, 13, 33, 9, 42, 27, 1, 4, 55, 31, 25, 49, 
    20, 11, 48, 9, 13, 29, 13, 33, 9, 42, 27, 1, 4), min = c(24L, 
    35L, 20L, 27L, 32L, 0L, 11L, 24L, 57L, 35L, 54L, 20L, 27L, 
    47L, 52L, 49L, 50L, 17L, 32L, 5L, 8L, 14L, 0L, 11L, 24L, 
    57L, 35L, 54L, 20L, 27L), hour = c(18L, 18L, 18L, 18L, 18L, 
    18L, 18L, 18L, 17L, 18L, 17L, 18L, 18L, 17L, 17L, 17L, 17L, 
    18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 17L, 18L, 17L, 18L, 
    18L), mday = c(24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 
    24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 
    24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L), mon = c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), 
        year = c(114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 
        114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 
        114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 
        114L, 114L, 114L, 114L), wday = c(1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), yday = c(82L, 
        82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 
        82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 
        82L, 82L, 82L, 82L, 82L, 82L, 82L), isdst = c(0L, 0L, 
        0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
        0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
        )), .Names = c("sec", "min", "hour", "mday", "mon", "year", 
    "wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt"), tzone = "GMT"), 
    lcid = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("1000001", "1000002", "1000003", 
    "1000004", "1000005", "1000006", "1000007", "1000008", "1000009", 
    "1000010", "1000045", "1000054", "1000070", "1000088", "1000089", 
    "1000090", "1000097", "1000098", "1000099", "1000100", "1000101", 
    "1000102", "1000103", "1000104", "1000105", "1000106", "1000107", 
    "1000109"), class = "factor"), tripno = structure(c(2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", 
    "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
    "2", "20", "21", "22", "23", "24", "3", "4", "5", "6", "7", 
    "8", "81", "9"), class = "factor"), destination = structure(c(3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Debden", 
    "Ealing Broadway", "Epping", "Grange Hill via Woodford", 
    "Hainault", "Hainault via Newbury Park", "Hainault via Woodford", 
    "Leytonstone", "Loughton", "Marble Arch", "Newbury Park", 
    "North Acton", "Northolt", "Ruislip Gardens", "West Ruislip", 
    "White City", "Woodford", "Woodford Via Hainault"), class = "factor"), 
    line = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "C", class = "factor")), .Names = c("depart", 
"arrival", "traveltime", "departuretime", "arrivaltime", "lcid", 
"tripno", "destination", "line"), row.names = c(NA, 30L), class = "data.frame")

Upvotes: 3

Views: 1165

Answers (1)

agstudy
agstudy

Reputation: 121568

Here a vectorized version of your code. Basically I used Map to avoid double loops and filling the result manullay (using rbind, very solw).

cutOffTopPercent <- 
  function(data,percent=0.99){
  cut_off_dep_arr <- 
  function(dep,arr){
    dataSubset <- data[data$depart == dep & data$arrival == arr,]
    upperBorder <- getTopPercentileBottom(dataSubset, percent)
    dataSubset[dataSubset$traveltime <= upperBorder,]  ##  <= not <
  }
  Map(cut_off_dep_arr,df$depart,df$arrival)
}

cutOffTopPercent(data=df)

Upvotes: 1

Related Questions