-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_data.Rmd
393 lines (326 loc) · 15.7 KB
/
make_data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
---
title: "Generating the data from the CliRes database"
output:
html_document:
theme: cosmo
toc: true
editor_options:
chunk_output_type: console
css: style.css
---
```{r general_options, include = FALSE}
knitr::knit_hooks$set(
margin = function(before, options, envir) {
if (before) par(mgp = c(1.5, .5, 0), bty = "n", plt = c(.105, .97, .13, .97))
else NULL
},
prompt = function(before, options, envir) {
options(prompt = if (options$engine %in% c("sh", "bash")) "$ " else "> ")
})
knitr::opts_chunk$set(margin = TRUE, prompt = TRUE, comment = "",
collapse = TRUE, cache = FALSE, autodep = TRUE,
dev.args = list(pointsize = 11), fig.height = 3.5,
fig.width = 4.24725, fig.retina = 2, fig.align = "center")
options(width = 250)
```
## CliRes excel files
The CliRes data are in 2 excel files, each with many tabs. One of these files
contains general information and the other one contains the surveillance data:
```{r}
general_data <- "https://www.dropbox.com/s/3hfnhd39fxjmcnh/20-5-2019-_18ZN_Category_V1_Data.xls?dl=0"
surveillance_data <- "https://www.dropbox.com/s/4kk6qewmntw3q5o/20-5-2019-_18ZN_V1_Data.xls?dl=0"
```
## Packages
Installing the required packages:
```{r}
required <- c("dplyr", "magrittr", "purrr", "readxl", "tibble", "tidyr")
to_install <- setdiff(required, row.names(installed.packages()))
if (length(to_install)) install.packages(to_install)
```
Loading `magrittr`:
```{r}
library(magrittr)
```
## Utilitary functions
Naming output of `lapply()`:
```{r}
lapply2 <- function(X, FUN, ...) setNames(lapply(X, FUN, ...), X)
```
Tuning the `download.file()` function in order to be able to download from a
Dropbox link:
```{r}
download_dropbox <- function(url, ...) download.file(sub("dl=0", "raw=1", url), ...)
```
The following function returns the flocks with missing weeks:
```{r}
missing_weeks <- function(x) {
tmp <- x %>%
dplyr::group_by(USUBJID, FLOCKSEQUENCE) %>%
dplyr::arrange(WEEK) %>%
dplyr::summarise(a = length(unique(diff(WEEK)))) %>%
dplyr::ungroup() %>%
dplyr::filter(a > 1)
if (nrow(tmp)) return(purrr::map2(tmp$USUBJID,
tmp$FLOCKSEQUENCE,
function(farm, flock)
x %>%
dplyr::filter(USUBJID == farm, FLOCKSEQUENCE == flock) %>%
dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK)))
message("There is no missing week.")
invisible(0)
}
```
## ESVAC's IU-to-mg conversion data
The
[ESVAC](https://www.ema.europa.eu/en/veterinary-regulatory/overview/antimicrobial-resistance/european-surveillance-veterinary-antimicrobial-consumption-esvac)
conversion factors are available from the table 2 of page 62 of this
[OIE](http://www.oie.int)
[report](http://www.oie.int/fileadmin/Home/fr/Our_scientific_expertise/docs/pdf/AMR/Survey_on_monitoring_antimicrobial_agents_Dec2016.pdf).
A copy-paste of this table without headers is available
[here](https://raw.githubusercontent.com/viparc/clires_data/master/raw_data/esvac.txt).
Let's download this file:
```{r}
tmp <- tempfile()
download.file("https://raw.githubusercontent.com/viparc/clires_data/master/raw_data/esvac.txt", tmp)
```
And read and reformat the data into a named numeric vector:
```{r}
(esvac <- tmp %>%
readLines() %>%
sub("^#.*", "", .) %>%
sub(" *", "", .) %>%
{.[. != ""]} %>%
matrix(ncol = 4, byrow = TRUE) %>%
as.data.frame(stringsAsFactors = FALSE) %>%
setNames(c("ab_vet_med", "ab_oie", "IU.mg", "mg.IU")) %>%
dplyr::filter(!grepl("methane", ab_vet_med)) %>%
dplyr::transmute(ab_oie = sub(" .*$", "", tolower(ab_oie)),
mg.IU = as.numeric(mg.IU)) %>%
dplyr::bind_rows(data.frame(ab_oie = "josamycin",
mg.IU = 0.001, stringsAsFactors = FALSE)) %$%
setNames(mg.IU, ab_oie))
```
Note that we got rid off one of the colistin data and that we manually added the
data for josamycin that was absent from the table.
## Drug codes from general information data
Downloading the general data file:
```{r}
tmp <- tempfile()
download_dropbox(general_data, tmp)
```
Reformating and merging with the `esvac` dataframe that we generated above:
```{r}
(drug_codes <- c("ANTIMICROBIAL", "ANTIMICROBIAL_GridAnti") %>%
lapply(readxl::read_excel, path = tmp) %>%
c(list(c("ANTIMICROBIAL_SEQ"))) %>%
do.call(dplyr::right_join, .) %>%
dplyr::select(CODE, ANTINAME1, CONCENTRATION, GRIDUNIT, GRIDPACK, GRIDPACKUNIT) %>%
dplyr::filter(! ANTINAME1 %in% c("alicin", "axit oxolinic", "iodo-hydroxyquinoline", "metronidazol", "nystatin")) %>%
dplyr::mutate(ANTINAME1 = dplyr::recode(ANTINAME1, sunfadimethoxine = "sulfadimethoxine",
sunphamethoxazole = "sulphamethoxazole"),
GRIDUNIT = dplyr::na_if(GRIDUNIT, "NA"),
GRIDUNIT = dplyr::na_if(GRIDUNIT, "na"),
GRIDUNIT = dplyr::recode(GRIDUNIT, UI = "IU", mg = "MG"),
GRIDPACKUNIT = dplyr::na_if(GRIDPACKUNIT, "na"),
GRIDPACKUNIT = dplyr::na_if(GRIDPACKUNIT, "VI"),
GRIDPACKUNIT = dplyr::recode(GRIDPACKUNIT, g = "G", ml = "G", ML = "G"),
GRIDPACK = dplyr::na_if(GRIDPACK, "na"),
GRIDPACK = as.numeric(GRIDPACK),
CONCENTRATION = dplyr::na_if(CONCENTRATION, "NA"),
CONCENTRATION = dplyr::na_if(CONCENTRATION, "na"),
CONCENTRATION = dplyr::recode(CONCENTRATION, S = "500"), # this will ultimately be corrected in CliRes CORRECTION
CONCENTRATION = as.numeric(CONCENTRATION),
CONCENTRATION = ifelse(GRIDUNIT == "IU", CONCENTRATION * esvac[ANTINAME1], CONCENTRATION),
GRIDUNIT = ifelse(GRIDUNIT == "IU", "MG", GRIDUNIT), # linked to the above line
CONCENTRATION = ifelse(GRIDUNIT == "MG", CONCENTRATION / 1000, CONCENTRATION),
GRIDUNIT = ifelse(GRIDUNIT == "MG", "G", GRIDUNIT), # linked to the above line
GRIDPACK = ifelse(GRIDPACKUNIT == "KG", 1000 * GRIDPACK, GRIDPACK),
GRIDPACKUNIT = ifelse(GRIDPACKUNIT == "KG", "G", GRIDPACKUNIT), # linked to the above line
proportion = CONCENTRATION / GRIDPACK) %>%
dplyr::select(CODE, ANTINAME1, proportion))
```
## The surveillance data
Downloading the surveillance data file:
```{r}
tmp <- tempfile()
download_dropbox(surveillance_data, tmp)
```
For antimicrobial, disease and chicken data in the surveillance file, we need
the following pairs of tabs respectively, that we have to merge:
```{r}
antimicrobial <- c("ANTIMICROBIAL", "ANTIMICROBIAL_GridEvent")
disease <- c("DISEASE", "DISEASE_GridEvent")
chicks <- c("MID_INOUT", "MID_INOUT_GridEvent")
```
Reading the data from file:
```{r}
(surveillance <- lapply2(c(antimicrobial, disease, chicks, "SAMPLE"),
readxl::read_excel, path = tmp))
```
### Sampling dates
Retrieving the sampling dates. One key feature of the code below is to convert
a sampling date into a week index. `SAMPLINGVISIT` tells whether the visit is at
the start (`S`), the middle (`M`) or the end (`E`) of the flock.
```{r}
(samples <- surveillance$SAMPLE %>%
# End sampling of flock 2 of farm 75-013 is duplicated, so we filter it out: CORRECTION
dplyr::filter(!(USUBJID == "75-013" & FLOCKSEQUENCE == "02" & SAMPLE_SEQ == 8)) %>%
dplyr::select(USUBJID, FLOCKSEQUENCE, SAMPLINGDATE, SAMPLINGVISIT) %>%
assign("tmp", ., 1) %>% # for reuse 4 lines later
dplyr::filter(SAMPLINGVISIT == "S") %>% # | gets the date...
dplyr::select(-SAMPLINGVISIT) %>% # | ... of the first week...
dplyr::rename(start = SAMPLINGDATE) %>% # | ... of each flock.
dplyr::right_join(tmp, c("USUBJID", "FLOCKSEQUENCE")) %>%
dplyr::mutate(WEEK = as.numeric(floor((SAMPLINGDATE - start) / (60 * 60 * 24 * 7)) + 1),
sampling = TRUE) %>%
dplyr::select(-start, -SAMPLINGDATE, -SAMPLINGVISIT) %>%
unique()) # dates of first visits are missing for flock 2 of farm 75-019, THIS NEED TO BE FIXED!!!!!
# flock 3 of farm 75-029 and flock 2 of farm 75-058. CORRECTION
```
This data frame contains all the dates of sampling, one per row.
### Clinical signs
Retrieving the clinical signs data. Note that there are 2 corrections that we
need to do here.
```{r}
(clinical_signs <- surveillance[disease] %>%
unname() %>% # because of the do.call() call that follows
c(list(c("USUBJID", "DISEASE_SEQ"))) %>%
do.call(dplyr::left_join, .) %>%
dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK, RESPIRATORY, DIARRHOEA, CNS, MALAISE,
LEGLESIONS, SUDDENDEATH, DISEASE_GridEvent_SEQ, CHICKENDISEASEDEATH, CHICKENSUDDENDEATH) %>%
dplyr::mutate(CHICKENDISEASEDEATH = tidyr::replace_na(CHICKENDISEASEDEATH, 0),
CHICKENSUDDENDEATH = tidyr::replace_na(CHICKENSUDDENDEATH, 0)) %>%
assign("tmp", ., 1) %>% # for reuse 3 lines later
# CORRECTION 1: weeks 8 instead of 9 for one of the two records of cycle 2 of farm 8:
dplyr::filter(USUBJID == "75-008", FLOCKSEQUENCE == "02", DISEASE_GridEvent_SEQ == 9) %>%
dplyr::mutate(WEEK = 9) %>%
dplyr::bind_rows(dplyr::filter(tmp, !(USUBJID == "75-008" & FLOCKSEQUENCE == "02" & DISEASE_GridEvent_SEQ == 9))) %>%
assign("tmp", ., 1) %>% # for reuse 3 lines later
# CORRECTION 2: week 19 instead of 10 for cycle 6 of farm 21:
dplyr::filter(USUBJID == "75-021", FLOCKSEQUENCE == "06", DISEASE_GridEvent_SEQ == 10) %>%
dplyr::mutate(WEEK = 10) %>%
dplyr::bind_rows(dplyr::filter(tmp, !(USUBJID == "75-021" & FLOCKSEQUENCE == "06" & DISEASE_GridEvent_SEQ == 10))) %>%
dplyr::select(-DISEASE_GridEvent_SEQ))
```
This data frame contains all the weeks of all the flocks of all the farms. There
is no missing weeks:
```{r}
missing_weeks(clinical_signs)
```
### Antimicrobial usage
Here we'll merge with the `drug_codes` dataframe that we generated above.
```{r}
(amu <- surveillance[antimicrobial] %>%
unname() %>% # because of the do.call() call that follows
c(list(c("USUBJID", "ANTIMICROBIAL_SEQ"))) %>%
do.call(dplyr::left_join, .) %>%
dplyr::select(USUBJID, FLOCKSEQUENCE, WEEKNO, CODE, AMOUTUSED, PACKAGEUNIT) %>%
dplyr::rename(WEEK = WEEKNO) %>%
dplyr::mutate(PACKAGEUNIT = dplyr::na_if(PACKAGEUNIT, "TAB")) %>%
dplyr::left_join(drug_codes, "CODE") %>%
tidyr::replace_na(list(ANTINAME1 = "unknown")) %>% # because some antibiotic names are unknown
dplyr::mutate(antibiotic = proportion * AMOUTUSED,
ANTINAME1 = paste0(ANTINAME1, "_g")) %>%
dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK, ANTINAME1, antibiotic) %>%
tibble::rowid_to_column() %>% # spread() requires unique row identifiers
tidyr::spread(ANTINAME1, antibiotic) %>%
dplyr::select(-rowid) %>% # we don't need rowid anymore
replace(is.na(.), 0) %>%
dplyr::group_by(USUBJID, FLOCKSEQUENCE, WEEK) %>%
dplyr::summarise_all(~sum(.)) %>%
dplyr::ungroup() %>%
dplyr::mutate_at(dplyr::vars(dplyr::ends_with("_g")), . %>% ifelse(is.na(.), 0, .)))
```
No missing weeks:
```{r}
missing_weeks(amu)
```
### Chicken data
Below is the code to retrieve data on chicken demographics. In there, we are
computing a variable indicating whether the cycle is finisheed.
```{r}
(chicken <- surveillance[chicks] %>%
unname() %>% # because of the do.call() call that follows
c(list(c("USUBJID", "MID_INOUT_SEQ"))) %>%
do.call(dplyr::left_join, .) %>%
dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK, CHICKENTOTAL, CHICKENEXTOTAL, MID_INOUT_GridEvent_SEQ) %>%
# CORRECTION:
assign("tmp", ., 1) %>% # for reuse 3 lines below
dplyr::filter(USUBJID == "75-070", FLOCKSEQUENCE == "08", MID_INOUT_GridEvent_SEQ == 6) %>%
dplyr::mutate(WEEK = 6) %>%
dplyr::bind_rows(dplyr::filter(tmp, !(USUBJID == "75-070" & FLOCKSEQUENCE == "08" & MID_INOUT_GridEvent_SEQ == 6))) %>%
dplyr::select(-MID_INOUT_GridEvent_SEQ) %>%
# (end of correction)
dplyr::mutate_at(c("CHICKENTOTAL", "CHICKENEXTOTAL"), as.integer) %>%
dplyr::mutate_at("CHICKENEXTOTAL", tidyr::replace_na, 0) %>%
assign("tmp", ., 1) %>% # for reuse on the final line
dplyr::group_by(USUBJID, FLOCKSEQUENCE) %>% # | computing whether...
dplyr::summarize(completed = min(CHICKENTOTAL) < 1) %>% # | ... the cycle is...
dplyr::ungroup() %>% # | ... finished.
dplyr::right_join(tmp, c("USUBJID", "FLOCKSEQUENCE")))
```
No missing week:
```{r}
missing_weeks(chicken)
```
## Merging into one single data frame
Here, we merge the 4 data frames that were computed above: `samples`,
`clinical_signs`, `amu` and `chicken`. First, we need to check that there is no
incompatibility between the `amu`, `clinical_signs` and `chicken` data frames:
```{r}
dplyr::anti_join(amu, clinical_signs, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK)
dplyr::anti_join(clinical_signs, amu, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK)
dplyr::anti_join(chicken, clinical_signs, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK)
dplyr::anti_join(clinical_signs, chicken, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK)
dplyr::anti_join(amu, chicken, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK)
dplyr::anti_join(chicken, amu, c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>% dplyr::select(USUBJID, FLOCKSEQUENCE, WEEK)
```
The week `17` of flock `05` of the farm `75-004` should be removed from the `amu`
data set:
```{r}
amu %>%
dplyr::filter(USUBJID == "75-004", FLOCKSEQUENCE == "05", WEEK == 17) %>%
dplyr::select(-USUBJID, -FLOCKSEQUENCE, -WEEK) %>%
unlist() %>%
unique()
```
Let's remove it:
```{r}
amu %<>% dplyr::filter(!(USUBJID == "75-004" & FLOCKSEQUENCE == "05" & WEEK == 17))
```
Furthermore, is seems that there is no data for the week `18` of the flock `06`
of the farm `75-077` in the `amu` and `clinical_signs` data frame.
```{r}
nrow(amu)
nrow(chicken)
nrow(clinical_signs)
```
We are ready for the merging and writing to disk:
```{r}
if (!dir.exists("data")) dir.create("data")
list(chicken, clinical_signs, amu, samples) %>% # chicken needs to be first because it's the only df with no NA
purrr::reduce(dplyr::left_join, by = c("USUBJID", "FLOCKSEQUENCE", "WEEK")) %>%
dplyr::mutate(sampling = ! is.na(sampling)) %>% # because in samples there are only sampled weeks
dplyr::mutate_at(dplyr::vars(FLOCKSEQUENCE, WEEK), as.integer) %>%
dplyr::arrange(USUBJID, FLOCKSEQUENCE, WEEK) %>%
dplyr::rename(farm = USUBJID,
flock = FLOCKSEQUENCE,
week = WEEK,
respiratory = RESPIRATORY,
diarrhoea = DIARRHOEA,
cns = CNS,
malaise = MALAISE,
leg_lesions = LEGLESIONS,
sudden_death = SUDDENDEATH,
nb_chicken = CHICKENTOTAL,
chicken_disease_death = CHICKENDISEASEDEATH,
chicken_sudden_death = CHICKENSUDDENDEATH,
nb_chicken_sold = CHICKENEXTOTAL) %>%
dplyr::select(farm, flock, week, sampling, completed, nb_chicken, nb_chicken_sold, chicken_disease_death, chicken_sudden_death, dplyr::everything()) %>%
write.csv("data/viparc.csv", FALSE, row.names = FALSE)
```
Note that, among the `clinical_signs`, `amu`, `chicken` and `samples` data frames,
`samples` is the only one that contains rows only for the weeks where samples
are taken. The three other data frames contain rows for all the weeks of all the
flocks of all the farms. For this reason, after the merging, we have to replace
the missing values in the `sampling` variable by `FALSE`.