Kate N
Kate N

Reputation: 453

In R how to fill in numbers that do not appear in the sequence?

I have a data set that list the percentiles for a set of scores like this:

 > percentiles
    Score Percentile
1     231          0
2     385          1
3     403          2
4     413          3
5     418          4
6     424          5
7     429          6
8     434          7
9     437          8
10    441          9
11    443         10

I would like the "Score" column to run from 100 to 500. That is, I would like Scores 100 to 231 to be associated with a Percentile of 0, Scores 232 to 385 to be associated with a Percentile of 1, etc. Is there a simple way to fill in the values that do not appear in the sequence of "Score" values so it looks like the below data set?

> percentiles
    Score Percentile
1     100          0
2     101          0
3     102          0
4     103          0
5     104          0
6     105          0
7     106          0
8     107          0
9     108          0
10    109          0
--------------------
130   229          0
131   230          0
132   231          0
133   232          1
134   233          1
135   234          1
136   235          1
137   236          1
138   237          1
139   238          1
140   239          1


Upvotes: 4

Views: 332

Answers (6)

hello_friend
hello_friend

Reputation: 5788

A bit hacky Base R:

# Create a dataframe with all score values in the range: 

score_range <- merge(data.frame(Score = c(100:500)), percentiles, by = "Score", all.x = TRUE)

# Reverse the order of the dataframe: 

score_range <- score_range[rev(order(score_range$Score)),]

# Change the first NA to the maximum score: 

score_range$Percentile[which(is.na(score_range$Percentile))][1] <- max(score_range$Percentile, na.rm = TRUE)

# Replace all NAs with the value before them: 

score_range$Percentile <- na.omit(score_range$Percentile)[cumsum(!is.na(score_range$Percentile))]

Data:

percentiles <- structure(list(Score = c(231L, 385L, 403L, 413L, 418L, 424L, 
                                        429L, 434L, 437L, 441L, 443L), 
                              Percentile = 0:10), class = "data.frame",
                         row.names = c("1", 
                                       "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"))

Upvotes: 0

ThomasIsCoding
ThomasIsCoding

Reputation: 101753

Here is a base R solution, where cut() and match() are the key points to make it, i.e.,

df <- data.frame(Score = (x <- 100:500), 
                 percentile = percentiles$Percentile[match(s <-cut(x,c(0,percentiles$Score)),levels(s))])

such that

> df
    Score percentile
1     100          0
2     101          0
3     102          0
4     103          0
5     104          0
6     105          0
7     106          0
8     107          0
9     108          0
10    109          0
11    110          0
12    111          0
13    112          0
14    113          0
15    114          0
16    115          0
17    116          0
18    117          0
19    118          0
20    119          0
21    120          0
22    121          0
23    122          0
24    123          0
25    124          0
26    125          0
27    126          0
28    127          0
29    128          0
30    129          0
31    130          0
32    131          0
33    132          0
34    133          0
35    134          0
36    135          0
37    136          0
38    137          0
39    138          0
40    139          0
41    140          0
42    141          0
43    142          0
44    143          0
45    144          0
46    145          0
47    146          0
48    147          0
49    148          0
50    149          0
51    150          0
52    151          0
53    152          0
54    153          0
55    154          0
56    155          0
57    156          0
58    157          0
59    158          0
60    159          0
61    160          0
62    161          0
63    162          0
64    163          0
65    164          0
66    165          0
67    166          0
68    167          0
69    168          0
70    169          0
71    170          0
72    171          0
73    172          0
74    173          0
75    174          0
76    175          0
77    176          0
78    177          0
79    178          0
80    179          0
81    180          0
82    181          0
83    182          0
84    183          0
85    184          0
86    185          0
87    186          0
88    187          0
89    188          0
90    189          0
91    190          0
92    191          0
93    192          0
94    193          0
95    194          0
96    195          0
97    196          0
98    197          0
99    198          0
100   199          0
101   200          0
102   201          0
103   202          0
104   203          0
105   204          0
106   205          0
107   206          0
108   207          0
109   208          0
110   209          0
111   210          0
112   211          0
113   212          0
114   213          0
115   214          0
116   215          0
117   216          0
118   217          0
119   218          0
120   219          0
121   220          0
122   221          0
123   222          0
124   223          0
125   224          0
126   225          0
127   226          0
128   227          0
129   228          0
130   229          0
131   230          0
132   231          0
133   232          1
134   233          1
135   234          1
136   235          1
137   236          1
138   237          1
139   238          1
140   239          1
141   240          1
142   241          1
143   242          1
144   243          1
145   244          1
146   245          1
147   246          1
148   247          1
149   248          1
150   249          1
151   250          1
152   251          1
153   252          1
154   253          1
155   254          1
156   255          1
157   256          1
158   257          1
159   258          1
160   259          1
161   260          1
162   261          1
163   262          1
164   263          1
165   264          1
166   265          1
167   266          1
168   267          1
169   268          1
170   269          1
171   270          1
172   271          1
173   272          1
174   273          1
175   274          1
176   275          1
177   276          1
178   277          1
179   278          1
180   279          1
181   280          1
182   281          1
183   282          1
184   283          1
185   284          1
186   285          1
187   286          1
188   287          1
189   288          1
190   289          1
191   290          1
192   291          1
193   292          1
194   293          1
195   294          1
196   295          1
197   296          1
198   297          1
199   298          1
200   299          1
201   300          1
202   301          1
203   302          1
204   303          1
205   304          1
206   305          1
207   306          1
208   307          1
209   308          1
210   309          1
211   310          1
212   311          1
213   312          1
214   313          1
215   314          1
216   315          1
217   316          1
218   317          1
219   318          1
220   319          1
221   320          1
222   321          1
223   322          1
224   323          1
225   324          1
226   325          1
227   326          1
228   327          1
229   328          1
230   329          1
231   330          1
232   331          1
233   332          1
234   333          1
235   334          1
236   335          1
237   336          1
238   337          1
239   338          1
240   339          1
241   340          1
242   341          1
243   342          1
244   343          1
245   344          1
246   345          1
247   346          1
248   347          1
249   348          1
250   349          1
251   350          1
252   351          1
253   352          1
254   353          1
255   354          1
256   355          1
257   356          1
258   357          1
259   358          1
260   359          1
261   360          1
262   361          1
263   362          1
264   363          1
265   364          1
266   365          1
267   366          1
268   367          1
269   368          1
270   369          1
271   370          1
272   371          1
273   372          1
274   373          1
275   374          1
276   375          1
277   376          1
278   377          1
279   378          1
280   379          1
281   380          1
282   381          1
283   382          1
284   383          1
285   384          1
286   385          1
287   386          2
288   387          2
289   388          2
290   389          2
291   390          2
292   391          2
293   392          2
294   393          2
295   394          2
296   395          2
297   396          2
298   397          2
299   398          2
300   399          2
301   400          2
302   401          2
303   402          2
304   403          2
305   404          3
306   405          3
307   406          3
308   407          3
309   408          3
310   409          3
311   410          3
312   411          3
313   412          3
314   413          3
315   414          4
316   415          4
317   416          4
318   417          4
319   418          4
320   419          5
321   420          5
322   421          5
323   422          5
324   423          5
325   424          5
326   425          6
327   426          6
328   427          6
329   428          6
330   429          6
331   430          7
332   431          7
333   432          7
334   433          7
335   434          7
336   435          8
337   436          8
338   437          8
339   438          9
340   439          9
341   440          9
342   441          9
343   442         10
344   443         10
345   444         NA
346   445         NA
347   446         NA
348   447         NA
349   448         NA
350   449         NA
351   450         NA
352   451         NA
353   452         NA
354   453         NA
355   454         NA
356   455         NA
357   456         NA
358   457         NA
359   458         NA
360   459         NA
361   460         NA
362   461         NA
363   462         NA
364   463         NA
365   464         NA
366   465         NA
367   466         NA
368   467         NA
369   468         NA
370   469         NA
371   470         NA
372   471         NA
373   472         NA
374   473         NA
375   474         NA
376   475         NA
377   476         NA
378   477         NA
379   478         NA
380   479         NA
381   480         NA
382   481         NA
383   482         NA
384   483         NA
385   484         NA
386   485         NA
387   486         NA
388   487         NA
389   488         NA
390   489         NA
391   490         NA
392   491         NA
393   492         NA
394   493         NA
395   494         NA
396   495         NA
397   496         NA
398   497         NA
399   498         NA
400   499         NA
401   500         NA

Upvotes: 1

user2474226
user2474226

Reputation: 1502

In base R you could use the findInterval function to break up your sequence 100:500 into buckets determined by the Score, then index into the Percentile column:

x <- 100:500
ind <- findInterval(x, percentiles$Score, left.open = TRUE)
output <- data.frame(Score = x, Percentile = percentiles$Percentile[ind + 1])

Values of x above 443 will receive a percentile of NA.

Upvotes: 1

camille
camille

Reputation: 16842

You might think about this differently: instead of a data frame to fill, as a set of breaks for binning your scores. Use the scores as the breaks with -Inf tacked on to have the lower bound. If you need something different to happen for the scores above the highest break, add Inf to the end of the breaks, but you'll need to come up with an additional label.

library(dplyr)

dat <- data.frame(Score = 100:500) %>%
  mutate(Percentile = cut(Score, breaks = c(-Inf, percentiles$Score), 
                          labels = percentiles$Percentile, 
                          right = T, include.lowest = F)) 

Taking a look at a few of the breaking points:

slice(dat, c(129:135, 342:346))
#>    Score Percentile
#> 1    228          0
#> 2    229          0
#> 3    230          0
#> 4    231          0
#> 5    232          1
#> 6    233          1
#> 7    234          1
#> 8    441          9
#> 9    442         10
#> 10   443         10
#> 11   444       <NA>
#> 12   445       <NA>

Upvotes: 2

IceCreamToucan
IceCreamToucan

Reputation: 28695

If you convert percentiles to a data.table, you could do a rolling join with a new table of all scores 100:500. The rolling join with roll = -Inf gives a fill-backward behavior by itself, but still the 444:500 values are NA so a forward nafill is added at the end.

library(data.table)
setDT(percentiles)

percentiles[data.table(Score = 100:500), on = .(Score), roll = -Inf
            ][, Percentile := nafill(Percentile, 'locf')]


#      Score Percentile
#   1:   100          0
#   2:   101          0
#   3:   102          0
#   4:   103          0
#   5:   104          0
#  ---                 
# 397:   496         10
# 398:   497         10
# 399:   498         10
# 400:   499         10
# 401:   500         10

Upvotes: 4

akrun
akrun

Reputation: 887231

We could use complete

library(dplyr)
library(tidyr)
out <- complete(percentiles, Score = 100:500) %>%
       fill(Percentile, .direction = "updown")
out %>%
     slice(c(1:10, 130:140)) %>% 
     as.data.frame
#   Score Percentile
#1    100          0
#2    101          0
#3    102          0
#4    103          0
#5    104          0
#6    105          0
#7    106          0
#8    107          0
#9    108          0
#10   109          0
#11   229          0
#12   230          0
#13   231          0
#14   232          1
#15   233          1
#16   234          1
#17   235          1
#18   236          1
#19   237          1
#20   238          1
#21   239          1

data

percentiles <- structure(list(Score = c(231L, 385L, 403L, 413L, 418L, 424L, 
429L, 434L, 437L, 441L, 443L), Percentile = 0:10), class = "data.frame",
row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11"))

Upvotes: 1

Related Questions