cherrytree
cherrytree

Reputation: 1581

odd behavior with select in dplyr

I'm experiencing odd behavior with dplyr's `select()` function: it does not drop the `cross_valid` column from a data frame produced by `group_by()` followed by `summarise()`.

Here is the original data:

orig <- structure(list(park = structure(c(4L, 4L, 4L, 4L, 4L), .Label = c("miss", 
"piro", "sacn", "slbe"), class = "factor"), year = c(2006L, 2009L, 
2006L, 2008L, 2009L), agent = structure(c(5L, 5L, 5L, 7L, 5L), .Label = c("agriculture", 
"beaver", "development", "flooding", "forest_pathogen", "harvest_00_20", 
"harvest_30_60", "harvest_70_90", "none"), class = "factor"), 
    ha = c(4.32, 1.17, 3.51, 2.07, 9.18), loc_01 = structure(c(9L, 
    5L, 9L, 5L, 5L), .Label = c("miss", "non_miss", "non_piro", 
    "non_sacn", "non_slbe", "none", "piro", "sacn", "slbe"), class = "factor"), 
    loc_02 = structure(c(5L, 1L, 5L, 1L, 1L), .Label = c("none", 
    "piro_core", "piro_ibz", "slbe_mainland", "slbe_southmanitou"
    ), class = "factor"), loc_03 = structure(c(1L, 1L, 1L, 1L, 
    1L), .Label = "none", class = "factor"), cross_valid = c(1L, 
    1L, 1L, 1L, 1L)), .Names = c("park", "year", "agent", "ha", 
"loc_01", "loc_02", "loc_03", "cross_valid"), row.names = c(NA, 
5L), class = "data.frame")

Looks like:

> orig
  park year           agent   ha   loc_01            loc_02 loc_03 cross_valid
1 slbe 2006 forest_pathogen 4.32     slbe slbe_southmanitou   none           1
2 slbe 2009 forest_pathogen 1.17 non_slbe              none   none           1
3 slbe 2006 forest_pathogen 3.51     slbe slbe_southmanitou   none           1
4 slbe 2008   harvest_30_60 2.07 non_slbe              none   none           1
5 slbe 2009 forest_pathogen 9.18 non_slbe              none   none           1
> str(orig)
'data.frame':   5 obs. of  8 variables:
 $ park       : Factor w/ 4 levels "miss","piro",..: 4 4 4 4 4
 $ year       : int  2006 2009 2006 2008 2009
 $ agent      : Factor w/ 9 levels "agriculture",..: 5 5 5 7 5
 $ ha         : num  4.32 1.17 3.51 2.07 9.18
 $ loc_01     : Factor w/ 9 levels "miss","non_miss",..: 9 5 9 5 5
 $ loc_02     : Factor w/ 5 levels "none","piro_core",..: 5 1 5 1 1
 $ loc_03     : Factor w/ 1 level "none": 1 1 1 1 1
 $ cross_valid: int  1 1 1 1 1

Then I do a little summary...

    library (dplyr)
    summ <- orig %>%
    + group_by(park,cross_valid,agent) %>%
    + summarise(ha_dist=sum(ha))
    summ
    Source: local data frame [2 x 4]
    Groups: park, cross_valid

      park cross_valid           agent ha_dist
    1 slbe           1 forest_pathogen   18.18
    2 slbe           1   harvest_30_60    2.07
    str(summ)
    Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  4 variables:
     $ park       : Factor w/ 4 levels "miss","piro",..: 4 4
     $ cross_valid: int  1 1
     $ agent      : Factor w/ 9 levels "agriculture",..: 5 7
     $ ha_dist    : num  18.18 2.07
     - attr(*, "vars")=List of 2
      ..$ : symbol park
      ..$ : symbol cross_valid
     - attr(*, "drop")= logi TRUE

Then I try to drop 'cross_valid'...

sel <- select (summ,-cross_valid)
summ
Source: local data frame [2 x 4]
Groups: park, cross_valid

  park cross_valid           agent ha_dist
1 slbe           1 forest_pathogen   18.18
2 slbe           1   harvest_30_60    2.07
str(summ)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  4 variables:
 $ park       : Factor w/ 4 levels "miss","piro",..: 4 4
 $ cross_valid: int  1 1
 $ agent      : Factor w/ 9 levels "agriculture",..: 5 7
 $ ha_dist    : num  18.18 2.07
 - attr(*, "vars")=List of 2
  ..$ : symbol park
  ..$ : symbol cross_valid
 - attr(*, "drop")= logi TRUE
 - attr(*, "indices")=List of 1
  ..$ : int  0 1
 - attr(*, "group_sizes")= int 2
 - attr(*, "biggest_group_size")= int 2
 - attr(*, "labels")='data.frame':  1 obs. of  2 variables:
  ..$ park       : Factor w/ 4 levels "miss","piro",..: 4
  ..$ cross_valid: int 1
  ..- attr(*, "vars")=List of 2
  .. ..$ : symbol park
  .. ..$ : symbol cross_valid

And it won't drop summ$cross_valid

If I use base R to drop cross_valid, it works...

base.sel <- summ[-2]
base.sel
Source: local data frame [2 x 3]
Groups: 

  park           agent ha_dist
1 slbe forest_pathogen   18.18
2 slbe   harvest_30_60    2.07

I can drop orig$cross_valid using select...

drop.orig <- select (orig,-cross_valid)
drop.orig
  park year           agent   ha   loc_01            loc_02 loc_03
1 slbe 2006 forest_pathogen 4.32     slbe slbe_southmanitou   none
2 slbe 2009 forest_pathogen 1.17 non_slbe              none   none
3 slbe 2006 forest_pathogen 3.51     slbe slbe_southmanitou   none
4 slbe 2008   harvest_30_60 2.07 non_slbe              none   none
5 slbe 2009 forest_pathogen 9.18 non_slbe              none   none

Since I can drop the variable with base R, it isn't a big deal, but I thought there may be some glitch with dplyr. It's likely something with the structure of the variable, but I don't know what it would be.

Thanks.

-cherrytree

Upvotes: 7

Views: 558

Answers (1)

akrun
akrun

Reputation: 886938

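Try `ungroup()` before `select()`. After `summarise()`, `summ` is still grouped by `park` and `cross_valid` (see the `groups(summ)` output below), and `select()` on a grouped data frame always keeps the grouping variables, so `cross_valid` cannot be dropped until the grouping is removed: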

summ%>% 
ungroup() %>%
select(-cross_valid)
#  park           agent ha_dist
#1 slbe forest_pathogen   18.18
#2 slbe   harvest_30_60    2.07



groups(summ)
#[[1]]
#park

#[[2]]
#cross_valid

Upvotes: 9

Related Questions