Reputation: 587
I am trying to do the following. I have a dataset1 that goes from 2015-01-31 up until 2021-06-30:
dataset1_dates=c("2015-01-31","2015-02-28","2015-03-31","2015-04-30","2015-05-31","2015-06-30","2015-07-31","2015-08-31","2015-09-30","2015-10-31","2015-11-30","2015-12-31","2016-01-31","2016-02-29","2016-03-31","2016-04-30","2016-05-31","2016-06-30","2016-07-31","2016-08-31","2016-09-30","2016-10-31","2016-11-30","2016-12-31","2017-01-31","2017-02-28","2017-03-31","2017-04-30","2017-05-31","2017-06-30","2017-07-31","2017-08-31","2017-09-30","2017-10-31","2017-11-30","2017-12-31","2018-01-31","2018-02-28","2018-03-31","2018-04-30","2018-05-31","2018-06-30","2018-07-31","2018-08-31","2018-09-30","2018-10-31","2018-11-30","2018-12-31","2019-01-31","2019-02-28","2019-03-31","2019-04-30","2019-05-31","2019-06-30","2019-07-31","2019-08-31","2019-09-30","2019-10-31","2019-11-30","2019-12-31","2020-01-31","2020-02-29","2020-03-31","2020-04-30","2020-05-31","2020-06-30","2020-07-31","2020-08-31","2020-09-30","2020-10-31","2020-11-30","2020-12-31","2021-01-31","2021-02-28","2021-03-31","2021-04-30","2021-05-31","2021-06-30")
# add dates
dataset1 <- expand.grid(Organisation = c("A123","B234","C456"),
Date = dataset1_dates)
## sort
dataset1 <- dataset1[order(dataset1$Organisation, dataset1$Date),]
## reset id
rownames(dataset1) <- NULL
dataset1$Organisation <- as.character(dataset1$Organisation)
dataset1$Date <- as.Date(dataset1$Date, format="%Y-%m-%d")
Then I have a dataset2 that tells me at specific points in time the performance of each organisation at the time of inspection:
dataset2 <- read.table(
text = "
Organisation Date_inspection Performance
A123 2015-01-31 Good
A123 2016-01-14 OK
B234 2017-06-14 Inadequate
C456 2015-06-30 OK
C456 2016-02-10 Inspected but not rated
C456 2018-05-18 Good
C456 2020-03-21 OK",
header = TRUE)
dataset2$Organisation <- as.character(dataset2$Organisation)
dataset2$Date_inspection <- as.Date(dataset2$Date_inspection, format="%Y-%m-%d")
dataset2$Performance <- as.character(dataset2$Performance)
I would like to assign to each month before inspection, including the month of the inspection, the performance category of the organisation.
I would also like to consider months after the last inspection as equal to the performance category at date of last inspection.
When 'Inspected but not rated' then assume next category. E.g. for C456 then assume 'Good'.
Expected result:
Date | Organisation | Performance |
2015-01-31 | A123 | Good |
2015-02-28 | A123 | OK |
2015-03-31 | A123 | OK |
...
2016-01-31 | A123 | OK |
...
2021-06-30 | A123 | OK |
2015-01-31 | B234 | Inadequate |
2015-02-28 | B234 | Inadequate |
2015-03-31 | B234 | Inadequate |
...
2021-06-30 | B234 | Inadequate |
2015-01-31 | C456 | OK |
2015-02-28 | C456 | OK |
2015-03-31 | C456 | OK |
...
2015-06-30 | C456 | OK |
...
2016-02-29 | C456 | Good |
...
2018-05-31 | C456 | Good |
2018-06-30 | C456 | OK |
...
2020-03-31 | C456 | OK |
...
2021-06-30 | C456 | OK |
Any ideas on how to do this in R?
Upvotes: 1
Views: 68
Reputation: 16998
Edit: Corrected a mistake. Forgot to group the data.
Edit2: Missed the handling for "Inspected but not rated". Thank you @hello_friend!
I think you can handle this with dplyr
and tidyr
:
library(dplyr)
library(tidyr)
dataset1 %>%
mutate(year_month = format(Date, "%Y-%m")) %>%
left_join(
dataset2 %>%
mutate(year_month = format(Date_inspection, "%Y-%m"),
Performance = na_if(Performance, "Inspected but not rated")),
by = c("Organisation", "year_month")
) %>%
group_by(Organisation) %>%
fill(Performance, .direction = "updown") %>%
select(-year_month, -Date_inspection) %>%
ungroup()
this returns
# A tibble: 234 x 3
Organisation Date Performance
<chr> <date> <chr>
1 A123 2015-01-31 Good
2 A123 2015-02-28 OK
3 A123 2015-03-31 OK
4 A123 2015-04-30 OK
5 A123 2015-05-31 OK
6 A123 2015-06-30 OK
7 A123 2015-07-31 OK
8 A123 2015-08-31 OK
9 A123 2015-09-30 OK
10 A123 2015-10-31 OK
11 A123 2015-11-30 OK
12 A123 2015-12-31 OK
13 A123 2016-01-31 OK
14 A123 2016-02-29 OK
15 A123 2016-03-31 OK
16 A123 2016-04-30 OK
17 A123 2016-05-31 OK
18 A123 2016-06-30 OK
19 A123 2016-07-31 OK
20 A123 2016-08-31 OK
21 A123 2016-09-30 OK
22 A123 2016-10-31 OK
23 A123 2016-11-30 OK
24 A123 2016-12-31 OK
25 A123 2017-01-31 OK
26 A123 2017-02-28 OK
27 A123 2017-03-31 OK
28 A123 2017-04-30 OK
29 A123 2017-05-31 OK
30 A123 2017-06-30 OK
31 A123 2017-07-31 OK
32 A123 2017-08-31 OK
33 A123 2017-09-30 OK
34 A123 2017-10-31 OK
35 A123 2017-11-30 OK
36 A123 2017-12-31 OK
37 A123 2018-01-31 OK
38 A123 2018-02-28 OK
39 A123 2018-03-31 OK
40 A123 2018-04-30 OK
41 A123 2018-05-31 OK
42 A123 2018-06-30 OK
43 A123 2018-07-31 OK
44 A123 2018-08-31 OK
45 A123 2018-09-30 OK
46 A123 2018-10-31 OK
47 A123 2018-11-30 OK
48 A123 2018-12-31 OK
49 A123 2019-01-31 OK
50 A123 2019-02-28 OK
51 A123 2019-03-31 OK
52 A123 2019-04-30 OK
53 A123 2019-05-31 OK
54 A123 2019-06-30 OK
55 A123 2019-07-31 OK
56 A123 2019-08-31 OK
57 A123 2019-09-30 OK
58 A123 2019-10-31 OK
59 A123 2019-11-30 OK
60 A123 2019-12-31 OK
61 A123 2020-01-31 OK
62 A123 2020-02-29 OK
63 A123 2020-03-31 OK
64 A123 2020-04-30 OK
65 A123 2020-05-31 OK
66 A123 2020-06-30 OK
67 A123 2020-07-31 OK
68 A123 2020-08-31 OK
69 A123 2020-09-30 OK
70 A123 2020-10-31 OK
71 A123 2020-11-30 OK
72 A123 2020-12-31 OK
73 A123 2021-01-31 OK
74 A123 2021-02-28 OK
75 A123 2021-03-31 OK
76 A123 2021-04-30 OK
77 A123 2021-05-31 OK
78 A123 2021-06-30 OK
79 B234 2015-01-31 Inadequate
80 B234 2015-02-28 Inadequate
81 B234 2015-03-31 Inadequate
82 B234 2015-04-30 Inadequate
83 B234 2015-05-31 Inadequate
84 B234 2015-06-30 Inadequate
85 B234 2015-07-31 Inadequate
86 B234 2015-08-31 Inadequate
87 B234 2015-09-30 Inadequate
88 B234 2015-10-31 Inadequate
89 B234 2015-11-30 Inadequate
90 B234 2015-12-31 Inadequate
91 B234 2016-01-31 Inadequate
92 B234 2016-02-29 Inadequate
93 B234 2016-03-31 Inadequate
94 B234 2016-04-30 Inadequate
95 B234 2016-05-31 Inadequate
96 B234 2016-06-30 Inadequate
97 B234 2016-07-31 Inadequate
98 B234 2016-08-31 Inadequate
99 B234 2016-09-30 Inadequate
100 B234 2016-10-31 Inadequate
101 B234 2016-11-30 Inadequate
102 B234 2016-12-31 Inadequate
103 B234 2017-01-31 Inadequate
104 B234 2017-02-28 Inadequate
105 B234 2017-03-31 Inadequate
106 B234 2017-04-30 Inadequate
107 B234 2017-05-31 Inadequate
108 B234 2017-06-30 Inadequate
109 B234 2017-07-31 Inadequate
110 B234 2017-08-31 Inadequate
111 B234 2017-09-30 Inadequate
112 B234 2017-10-31 Inadequate
113 B234 2017-11-30 Inadequate
114 B234 2017-12-31 Inadequate
115 B234 2018-01-31 Inadequate
116 B234 2018-02-28 Inadequate
117 B234 2018-03-31 Inadequate
118 B234 2018-04-30 Inadequate
119 B234 2018-05-31 Inadequate
120 B234 2018-06-30 Inadequate
121 B234 2018-07-31 Inadequate
122 B234 2018-08-31 Inadequate
123 B234 2018-09-30 Inadequate
124 B234 2018-10-31 Inadequate
125 B234 2018-11-30 Inadequate
126 B234 2018-12-31 Inadequate
127 B234 2019-01-31 Inadequate
128 B234 2019-02-28 Inadequate
129 B234 2019-03-31 Inadequate
130 B234 2019-04-30 Inadequate
131 B234 2019-05-31 Inadequate
132 B234 2019-06-30 Inadequate
133 B234 2019-07-31 Inadequate
134 B234 2019-08-31 Inadequate
135 B234 2019-09-30 Inadequate
136 B234 2019-10-31 Inadequate
137 B234 2019-11-30 Inadequate
138 B234 2019-12-31 Inadequate
139 B234 2020-01-31 Inadequate
140 B234 2020-02-29 Inadequate
141 B234 2020-03-31 Inadequate
142 B234 2020-04-30 Inadequate
143 B234 2020-05-31 Inadequate
144 B234 2020-06-30 Inadequate
145 B234 2020-07-31 Inadequate
146 B234 2020-08-31 Inadequate
147 B234 2020-09-30 Inadequate
148 B234 2020-10-31 Inadequate
149 B234 2020-11-30 Inadequate
150 B234 2020-12-31 Inadequate
151 B234 2021-01-31 Inadequate
152 B234 2021-02-28 Inadequate
153 B234 2021-03-31 Inadequate
154 B234 2021-04-30 Inadequate
155 B234 2021-05-31 Inadequate
156 B234 2021-06-30 Inadequate
157 C456 2015-01-31 OK
158 C456 2015-02-28 OK
159 C456 2015-03-31 OK
160 C456 2015-04-30 OK
161 C456 2015-05-31 OK
162 C456 2015-06-30 OK
163 C456 2015-07-31 Good
164 C456 2015-08-31 Good
165 C456 2015-09-30 Good
166 C456 2015-10-31 Good
167 C456 2015-11-30 Good
168 C456 2015-12-31 Good
169 C456 2016-01-31 Good
170 C456 2016-02-29 Good
171 C456 2016-03-31 Good
172 C456 2016-04-30 Good
173 C456 2016-05-31 Good
174 C456 2016-06-30 Good
175 C456 2016-07-31 Good
176 C456 2016-08-31 Good
177 C456 2016-09-30 Good
178 C456 2016-10-31 Good
179 C456 2016-11-30 Good
180 C456 2016-12-31 Good
181 C456 2017-01-31 Good
182 C456 2017-02-28 Good
183 C456 2017-03-31 Good
184 C456 2017-04-30 Good
185 C456 2017-05-31 Good
186 C456 2017-06-30 Good
187 C456 2017-07-31 Good
188 C456 2017-08-31 Good
189 C456 2017-09-30 Good
190 C456 2017-10-31 Good
191 C456 2017-11-30 Good
192 C456 2017-12-31 Good
193 C456 2018-01-31 Good
194 C456 2018-02-28 Good
195 C456 2018-03-31 Good
196 C456 2018-04-30 Good
197 C456 2018-05-31 Good
198 C456 2018-06-30 OK
199 C456 2018-07-31 OK
200 C456 2018-08-31 OK
201 C456 2018-09-30 OK
202 C456 2018-10-31 OK
203 C456 2018-11-30 OK
204 C456 2018-12-31 OK
205 C456 2019-01-31 OK
206 C456 2019-02-28 OK
207 C456 2019-03-31 OK
208 C456 2019-04-30 OK
209 C456 2019-05-31 OK
210 C456 2019-06-30 OK
211 C456 2019-07-31 OK
212 C456 2019-08-31 OK
213 C456 2019-09-30 OK
214 C456 2019-10-31 OK
215 C456 2019-11-30 OK
216 C456 2019-12-31 OK
217 C456 2020-01-31 OK
218 C456 2020-02-29 OK
219 C456 2020-03-31 OK
220 C456 2020-04-30 OK
221 C456 2020-05-31 OK
222 C456 2020-06-30 OK
223 C456 2020-07-31 OK
224 C456 2020-08-31 OK
225 C456 2020-09-30 OK
226 C456 2020-10-31 OK
227 C456 2020-11-30 OK
228 C456 2020-12-31 OK
229 C456 2021-01-31 OK
230 C456 2021-02-28 OK
231 C456 2021-03-31 OK
232 C456 2021-04-30 OK
233 C456 2021-05-31 OK
234 C456 2021-06-30 OK
##Data Here are the data after all those transforming shown in the question.
dataset1 <- structure(list(Organisation = c("A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456"), Date = structure(c(16466,
16494, 16525, 16555, 16586, 16616, 16647, 16678, 16708, 16739,
16769, 16800, 16831, 16860, 16891, 16921, 16952, 16982, 17013,
17044, 17074, 17105, 17135, 17166, 17197, 17225, 17256, 17286,
17317, 17347, 17378, 17409, 17439, 17470, 17500, 17531, 17562,
17590, 17621, 17651, 17682, 17712, 17743, 17774, 17804, 17835,
17865, 17896, 17927, 17955, 17986, 18016, 18047, 18077, 18108,
18139, 18169, 18200, 18230, 18261, 18292, 18321, 18352, 18382,
18413, 18443, 18474, 18505, 18535, 18566, 18596, 18627, 18658,
18686, 18717, 18747, 18778, 18808, 16466, 16494, 16525, 16555,
16586, 16616, 16647, 16678, 16708, 16739, 16769, 16800, 16831,
16860, 16891, 16921, 16952, 16982, 17013, 17044, 17074, 17105,
17135, 17166, 17197, 17225, 17256, 17286, 17317, 17347, 17378,
17409, 17439, 17470, 17500, 17531, 17562, 17590, 17621, 17651,
17682, 17712, 17743, 17774, 17804, 17835, 17865, 17896, 17927,
17955, 17986, 18016, 18047, 18077, 18108, 18139, 18169, 18200,
18230, 18261, 18292, 18321, 18352, 18382, 18413, 18443, 18474,
18505, 18535, 18566, 18596, 18627, 18658, 18686, 18717, 18747,
18778, 18808, 16466, 16494, 16525, 16555, 16586, 16616, 16647,
16678, 16708, 16739, 16769, 16800, 16831, 16860, 16891, 16921,
16952, 16982, 17013, 17044, 17074, 17105, 17135, 17166, 17197,
17225, 17256, 17286, 17317, 17347, 17378, 17409, 17439, 17470,
17500, 17531, 17562, 17590, 17621, 17651, 17682, 17712, 17743,
17774, 17804, 17835, 17865, 17896, 17927, 17955, 17986, 18016,
18047, 18077, 18108, 18139, 18169, 18200, 18230, 18261, 18292,
18321, 18352, 18382, 18413, 18443, 18474, 18505, 18535, 18566,
18596, 18627, 18658, 18686, 18717, 18747, 18778, 18808), class = "Date")), out.attrs = list(
dim = c(Organisation = 3L, Date = 78L), dimnames = list(Organisation = c("Organisation=A123",
"Organisation=B234", "Organisation=C456"), Date = c("Date=2015-01-31",
"Date=2015-02-28", "Date=2015-03-31", "Date=2015-04-30",
"Date=2015-05-31", "Date=2015-06-30", "Date=2015-07-31",
"Date=2015-08-31", "Date=2015-09-30", "Date=2015-10-31",
"Date=2015-11-30", "Date=2015-12-31", "Date=2016-01-31",
"Date=2016-02-29", "Date=2016-03-31", "Date=2016-04-30",
"Date=2016-05-31", "Date=2016-06-30", "Date=2016-07-31",
"Date=2016-08-31", "Date=2016-09-30", "Date=2016-10-31",
"Date=2016-11-30", "Date=2016-12-31", "Date=2017-01-31",
"Date=2017-02-28", "Date=2017-03-31", "Date=2017-04-30",
"Date=2017-05-31", "Date=2017-06-30", "Date=2017-07-31",
"Date=2017-08-31", "Date=2017-09-30", "Date=2017-10-31",
"Date=2017-11-30", "Date=2017-12-31", "Date=2018-01-31",
"Date=2018-02-28", "Date=2018-03-31", "Date=2018-04-30",
"Date=2018-05-31", "Date=2018-06-30", "Date=2018-07-31",
"Date=2018-08-31", "Date=2018-09-30", "Date=2018-10-31",
"Date=2018-11-30", "Date=2018-12-31", "Date=2019-01-31",
"Date=2019-02-28", "Date=2019-03-31", "Date=2019-04-30",
"Date=2019-05-31", "Date=2019-06-30", "Date=2019-07-31",
"Date=2019-08-31", "Date=2019-09-30", "Date=2019-10-31",
"Date=2019-11-30", "Date=2019-12-31", "Date=2020-01-31",
"Date=2020-02-29", "Date=2020-03-31", "Date=2020-04-30",
"Date=2020-05-31", "Date=2020-06-30", "Date=2020-07-31",
"Date=2020-08-31", "Date=2020-09-30", "Date=2020-10-31",
"Date=2020-11-30", "Date=2020-12-31", "Date=2021-01-31",
"Date=2021-02-28", "Date=2021-03-31", "Date=2021-04-30",
"Date=2021-05-31", "Date=2021-06-30"))), row.names = c(NA,
-234L), class = "data.frame")
dataset2 <- structure(list(Organisation = c("A123", "A123", "B234", "C456",
"C456", "C456", "C456"), Date_inspection = structure(c(16466,
16814, 17331, 16616, 16841, 17669, 18342), class = "Date"), Performance = c("Good",
"OK", "Inadequate", "OK", "Inspected but not rated", "Good",
"OK")), row.names = c(NA, -7L), class = "data.frame")
Upvotes: 2
Reputation: 5798
I'm sure this can be simplified but will work as desired:
# Recode Inspected but not rated to an NA of type
# character: clean_df2 => data.frame
clean_df2 <- transform(
with(
dataset2,
dataset2[
rev(
order(
Date_inspection
)
),
]
),
Performance = gsub(
"Inspected but not rated",
NA_character_,
Performance
)
)
# Expand the "dataset2" to months which the ratings
# are considered applicable over:
# inspectionsApplicable => data.frame
inspectionsApplicable <- unique(
data.frame(
do.call(
rbind,
lapply(
with(
clean_df2,
split(
clean_df2,
Organisation
)
),
function(x){
x$Month_inspected <- as.Date(
strftime(
x$Date_inspection,
"%Y-%m-01"
)
)
MaxMonthInData <- as.Date(
strftime(
max(
dataset1$Date[
dataset1$Organisation ==
unique(x$Organisation)
]
),
"%Y-%m-01"
)
)
data.frame(
Organisation = c(
x$Organisation[1],
x$Organisation
),
Months = c(
as.Date(MaxMonthInData),
as.Date(x$Month_inspected, "%Y-%m-%d")
),
Performance = c(
x$Performance[
which.min(
cumsum(
!(
is.na(
x$Performance
)
)
)
)
],
x$Performance
)
)
}
)
),
row.names = NULL
)
)
# Left join the tables, dropping dupes from
# from inspection result data.frame: ir_res => data.frame
ir_res <- merge(
transform(
with(
dataset1,
dataset1[
rev(
order(
Organisation,
Date
)
),
]
),
Months = as.Date(
strftime(
Date,
"%Y-%m-01"
)
)
),
with(
inspectionsApplicable,
inspectionsApplicable[
!(
duplicated(
paste0(
Organisation,
Months
),
fromLast = TRUE
)
),
]
),
by = c(
"Organisation",
"Months"
),
all.x = TRUE
)
# Back fill by group: res_ir2 => data.frame
res_ir2 <- do.call(
rbind,
lapply(
with(
ir_res,
split(
ir_res,
Organisation
)
),
function(x){
y <- with(
x,
x[
rev(
order(
Date
)
),
]
)
transform(
y,
Performance = na.omit(
Performance
)[
cumsum(
!(
is.na(
Performance
)
)
)
]
)
}
)
)
# Order by date and organisation: res => data.frame
res <- data.frame(
with(
res_ir2,
res_ir2[
order(
Organisation,
Date
),
]
),
row.names = NULL
)
Upvotes: 2