Tom
Tom

Reputation: 417

Subsetting my data is not yielding the expected results

I have a dataset of 93 observations. There are only two variables, a factor (size, a number) and it's response (percent, also a number). The factor value ranges from 0-2000. I would like to combine these 93 observations into three groups based on factor values (0-2, 2-50, and 50-2000) and see the total combined response value for each.

Here is my data:

> data2
   run  size percentage
1    1 0.375      0.010
2    2 0.412      0.020
3    3 0.452      0.032
4    4 0.496      0.043
5    5 0.545      0.053
6    6 0.598      0.060
7    7 0.656      0.066
8    8 0.721      0.070
9    9 0.791      0.071
10  10 0.868      0.072
11  11 0.953      0.070
12  12 1.047      0.069
13  13 1.149      0.067
14  14 1.261      0.065
15  15 1.385      0.065
16  16 1.520      0.066
17  17 1.668      0.068
18  18 1.832      0.072
19  19 2.011      0.077
20  20 2.207      0.083
21  21 2.423      0.090
22  22 2.660      0.097
23  23 2.920       0.10
24  24 3.205       0.11
25  25 3.519       0.12
26  26 3.863       0.13
27  27 4.240       0.13
28  28 4.655       0.14
29  29 5.110       0.14
30  30 5.610       0.14
31  31 6.158       0.14
32  32 6.760       0.14
33  33 7.421       0.15
34  34 8.147       0.15
35  35 8.943       0.15
36  36 9.817       0.16
37  37 10.78       0.18
38  38 11.83       0.19
39  39 12.99       0.21
40  40 14.26       0.23
41  41 15.65       0.24
42  42 17.18       0.25
43  43 18.86       0.27
44  44 20.70       0.28
45  45 22.73       0.30
46  46 24.95       0.30
47  47 27.39       0.29
48  48 30.07       0.27
49  49 33.01       0.23
50  50 36.24       0.21
51  51 39.78       0.20
52  52 43.67       0.21
53  53 47.94       0.22
54  54 52.62       0.19
55  55 57.77       0.13
56  56 63.41      0.070
57  57 69.61      0.055
58  58 76.42      0.087
59  59 83.89       0.14
60  60 92.09       0.17
61  61 101.1       0.17
62  62 111.0       0.18
63  63 121.8       0.27
64  64 133.7       0.43
65  65 146.8       0.64
66  66 161.2       0.88
67  67 176.9       1.16
68  68 194.2       1.51
69  69 213.2       1.94
70  70 234.1       2.47
71  71 256.9       3.16
72  72 282.1       4.03
73  73 309.6       5.02
74  74 339.9       6.05
75  75 373.1       6.96
76  76 409.6       7.63
77  77 449.7       8.01
78  78 493.6       8.08
79  79 541.9       7.82
80  80 594.9       7.13
81  81 653.0       6.01
82  82 716.8       4.81
83  83 786.9       3.57
84  84 863.9       2.09
85  85 948.3       1.01
86  86  1041       0.55
87  87  1143       0.22
88  88  1255      0.018
89  89  1377          0
90  90  1512          0
91  91  1660          0
92  92  1822          0
93      2000    

Here is a output of dput

dput(data2)
structure(list(run = c("1", "2", "3", "4", "5", "6", "7", "8", 
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", 
"31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", 
"42", "43", "44", "45", "46", "47", "48", "49", "50", "51", "52", 
"53", "54", "55", "56", "57", "58", "59", "60", "61", "62", "63", 
"64", "65", "66", "67", "68", "69", "70", "71", "72", "73", "74", 
"75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", 
"86", "87", "88", "89", "90", "91", "92", ""), size = c("0.375", 
"0.412", "0.452", "0.496", "0.545", "0.598", "0.656", "0.721", 
"0.791", "0.868", "0.953", "1.047", "1.149", "1.261", "1.385", 
"1.520", "1.668", "1.832", "2.011", "2.207", "2.423", "2.660", 
"2.920", "3.205", "3.519", "3.863", "4.240", "4.655", "5.110", 
"5.610", "6.158", "6.760", "7.421", "8.147", "8.943", "9.817", 
"10.78", "11.83", "12.99", "14.26", "15.65", "17.18", "18.86", 
"20.70", "22.73", "24.95", "27.39", "30.07", "33.01", "36.24", 
"39.78", "43.67", "47.94", "52.62", "57.77", "63.41", "69.61", 
"76.42", "83.89", "92.09", "101.1", "111.0", "121.8", "133.7", 
"146.8", "161.2", "176.9", "194.2", "213.2", "234.1", "256.9", 
"282.1", "309.6", "339.9", "373.1", "409.6", "449.7", "493.6", 
"541.9", "594.9", "653.0", "716.8", "786.9", "863.9", "948.3", 
"1041", "1143", "1255", "1377", "1512", "1660", "1822", "2000"
), percentage = c("0.013", "0.023", "0.034", "0.049", "0.061", 
"0.072", "0.083", "0.093", "0.10", "0.11", "0.12", "0.12", "0.13", 
"0.14", "0.14", "0.15", "0.15", "0.16", "0.17", "0.17", "0.18", 
"0.19", "0.20", "0.21", "0.22", "0.24", "0.25", "0.26", "0.28", 
"0.30", "0.31", "0.33", "0.35", "0.37", "0.39", "0.42", "0.45", 
"0.47", "0.50", "0.53", "0.56", "0.58", "0.59", "0.59", "0.58", 
"0.55", "0.52", "0.49", "0.46", "0.45", "0.45", "0.45", "0.44", 
"0.42", "0.38", "0.35", "0.32", "0.31", "0.33", "0.36", "0.42", 
"0.49", "0.59", "0.74", "0.94", "1.19", "1.49", "1.82", "2.18", 
"2.55", "2.94", "3.34", "3.78", "4.25", "4.73", "5.20", "5.60", 
"5.87", "5.93", "5.77", "5.37", "4.77", "4.03", "3.21", "2.36", 
"1.55", "0.81", "0.30", "0.056", "0.0044", "0", "0", "")), class = "data.frame", row.names = c(NA, 
-93L))

I have tried the following code, which I would think should give me the results that I'd like:

clay <- data2 %>% filter(size <= 2)
silt <- data2 %>% filter(size > 2 & size <= 50)
sand <- data2 %>% filter(size > 50 & size <= 2000)
sum(as.numeric(clay$percentage), na.rm=TRUE)
[1] 8.637
sum(as.numeric(silt$percentage), na.rm=TRUE)
[1] 57.217
sum(as.numeric(sand$percentage), na.rm=TRUE)
[1] 0

However, as you can see, the totals do not add up to what they should when looking at the dataset.

Any idea what I am doing wrong here?

Thanks in advance for any help

Upvotes: 0

Views: 66

Answers (2)

MrCorote
MrCorote

Reputation: 563

First, you can try to convert the factor variable into numeric:

data2$size <- as.numeric(levels(data2$size))[data2$size]

and then create a column with each group, which i think it's better than creating three separate data-frames.

library(dplyr)

data2 <- data2 %>% 
  mutate(
    group = case_when(
      size < 2 ~ 'Group 1',
      size >= 2 & size < 50 ~'Group 2',
      size >= 50 ~ 'Group 3',
    ))

It yields the same result as in @akrun:

sum(data2$percentage[data2$group=='Group 1'])
#[1] 1.75
sum(data2$percentage[data2$group=='Group 2'])
#[1] 13.5
sum(data2$percentage[data2$group=='Group 3'])
#[1] 84.8

Upvotes: 1

akrun
akrun

Reputation: 887223

The issue is the type of the columns. The columns types can be automatically changed with type.convert from base R or type_convert from readr

library(dplyr)
library(readr) 
data2 <- data2 %>%
            type_convert 

Note that the comparisons with character class is not doing what we expect to

'13.5' > 2
#[1] FALSE

as according to ?Comparison

If the two arguments are atomic vectors of different types, one is coerced to the type of the other, the (decreasing) order of precedence being character, complex, numeric, integer, logical and raw.


After changing the type, running the OP's code gives

sum(clay$percentage, na.rm=TRUE)
#[1] 1.748
sum(silt$percentage, na.rm=TRUE)
#[1] 13.5
sum(sand$percentage, na.rm=TRUE)
#[1] 84.7504

Upvotes: 0

Related Questions