This is a replication of a study of the serial interval of COVID-19 (the time between symptom onset in an index case and symptom onset in the secondary case it infected), based on 468 transmission events reported in China (https://wwwnc.cdc.gov/eid/article/26/6/20-0357_article). The main research questions are: (1) Are there pre-symptomatic transmissions? (2) What is the mean serial interval? (3) Does the serial interval differ between same-city and cross-city transmission? (4) Is the serial interval shorter for same-household transmission? We will replicate the main study results.
# load library
# library() stops with an error if tidyverse is not installed; require() would
# only return FALSE and let the script fail later at the first tidyverse call.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ----------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# report the directory this R session is running in
base::getwd()
## [1] "C:/Users/lai/Dropbox/Courses/bio-47120-2020"
# read the transmission-event table from the remote URL into a tibble
x <- readr::read_csv(file = "http://diverge.hunter.cuny.edu/~weigang/covid-19-cases.csv2")
## Parsed with column specification:
## cols(
## Event_index = col_double(),
## Index_ID = col_double(),
## Secondary_ID = col_double(),
## City = col_character(),
## Province = col_character(),
## Infection_location = col_character(),
## Symptom_onset_date = col_double(),
## Age = col_double(),
## Sex = col_character(),
## Secondary_infection_location = col_character(),
## Seconday_symptom_onset_date = col_double(),
## Seconday_Age = col_double(),
## Seconday_Sex = col_character(),
## Contact_type = col_character(),
## URL = col_character(),
## Data_source = col_character()
## )
# per-column summaries: quantiles/mean for numeric columns, length/class for text columns
x %>% summary()
## Event_index Index_ID Secondary_ID City
## Min. : 1.0 Min. : 22 Min. : 26 Length:468
## 1st Qu.:117.8 1st Qu.:1047 1st Qu.:1067 Class :character
## Median :234.5 Median :2232 Median :2290 Mode :character
## Mean :234.5 Mean :2675 Mean :2703
## 3rd Qu.:351.2 3rd Qu.:4023 3rd Qu.:4046
## Max. :468.0 Max. :6811 Max. :6813
##
## Province Infection_location Symptom_onset_date Age
## Length:468 Length:468 Min. :-3.0 Min. : 7.00
## Class :character Class :character 1st Qu.:12.0 1st Qu.:36.00
## Mode :character Mode :character Median :15.0 Median :46.00
## Mean :14.6 Mean :46.14
## 3rd Qu.:18.0 3rd Qu.:55.00
## Max. :29.0 Max. :88.00
## NA's :11
## Sex Secondary_infection_location Seconday_symptom_onset_date
## Length:468 Length:468 Min. : 0.00
## Class :character Class :character 1st Qu.:15.00
## Mode :character Mode :character Median :19.00
## Mean :18.56
## 3rd Qu.:22.00
## Max. :30.00
##
## Seconday_Age Seconday_Sex Contact_type URL
## Min. : 1.0 Length:468 Length:468 Length:468
## 1st Qu.:32.0 Class :character Class :character Class :character
## Median :46.5 Mode :character Mode :character Mode :character
## Mean :45.1
## 3rd Qu.:59.0
## Max. :90.0
## NA's :10
## Data_source
## Length:468
## Class :character
## Mode :character
##
##
##
##
# compact overview: one line per column with its type and first few values
x %>% glimpse()
## Observations: 468
## Variables: 16
## $ Event_index <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...
## $ Index_ID <dbl> 23, 23, 28, 35, 22, 49, 62, 60, 60, 63...
## $ Secondary_ID <dbl> 26, 27, 30, 37, 43, 53, 60, 61, 63, 64...
## $ City <chr> "Tianjin", "Tianjin", "Tianjin", "Tian...
## $ Province <chr> "Tianjin", "Tianjin", "Tianjin", "Tian...
## $ Infection_location <chr> "Tianjin", "Tianjin", "Xiaogan", "Tian...
## $ Symptom_onset_date <dbl> 16, 16, 15, 16, 14, 18, 15, 17, 17, 11...
## $ Age <dbl> 47, 47, 61, 49, 57, 35, 43, 52, 52, 35...
## $ Sex <chr> "Male", "Male", "Female", "Male", "Mal...
## $ Secondary_infection_location <chr> "Tianjin", "Tianjin", "Tianjin", "Tian...
## $ Seconday_symptom_onset_date <dbl> 20, 20, 21, 20, 16, 22, 17, 18, 11, 19...
## $ Seconday_Age <dbl> 74, 75, 61, 48, 90, 66, 52, 53, 35, 26...
## $ Seconday_Sex <chr> "Female", "Male", "Male", "Female", "M...
## $ Contact_type <chr> "Non-household", "Non-household", "Hou...
## $ URL <chr> "http://wsjk.tj.gov.cn/", "http://wsjk...
## $ Data_source <chr> "Tianjin Municipal Health Commission",...
# list the column names of the event table
colnames(x)
## [1] "Event_index" "Index_ID"
## [3] "Secondary_ID" "City"
## [5] "Province" "Infection_location"
## [7] "Symptom_onset_date" "Age"
## [9] "Sex" "Secondary_infection_location"
## [11] "Seconday_symptom_onset_date" "Seconday_Age"
## [13] "Seconday_Sex" "Contact_type"
## [15] "URL" "Data_source"
# add the serial interval column: symptom onset of the secondary case minus
# symptom onset of the index case
x <- mutate(x, serial_int = Seconday_symptom_onset_date - Symptom_onset_date)
# Histogram of serial interval
x %>%
  ggplot(aes(x = serial_int)) +
  geom_histogram(bins = 10) +
  theme_bw()
# Boxplot of serial interval by contact type, with every event overlaid as an
# open circle. outlier.shape = NA hides the boxplot's own outlier markers:
# geom_jitter() already draws every observation, so outliers would otherwise
# be plotted twice.
x %>%
  ggplot(aes(x = Contact_type, y = serial_int)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(shape = 1) +
  theme_bw()
# Violin plot of the same comparison, again with the individual events overlaid
x %>%
  ggplot(aes(x = Contact_type, y = serial_int)) +
  geom_violin() +
  geom_jitter(shape = 1) +
  theme_bw()
# compare serial interval between household and non-household contacts:
# keep only the two columns the test needs, then only rows with one of the
# two contact types of interest
x.contact <- x %>%
  select(serial_int, Contact_type) %>%
  filter(Contact_type %in% c("Household", "Non-household"))
# two-sample t-test of serial interval by contact type
t.test(serial_int ~ Contact_type, data = x.contact)
##
## Welch Two Sample t-test
##
## data: serial_int by Contact_type
## t = -0.7595, df = 223.86, p-value = 0.4484
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.6042564 0.7116725
## sample estimates:
## mean in group Household mean in group Non-household
## 4.028846 4.475138
# test differences by city
# imported is TRUE when Infection_location and Secondary_infection_location
# differ, FALSE when they match, and NA when either one is missing. The
# comparison already yields a logical vector, so no ifelse() wrapper is needed.
x <- x %>%
  mutate(imported = Infection_location != Secondary_infection_location)
x.city <- x %>%
  select(serial_int, imported)
# two-sample t-test of serial interval between the two groups
t.test(serial_int ~ imported, data = x.city)
##
## Welch Two Sample t-test
##
## data: serial_int by imported
## t = -0.82494, df = 260.11, p-value = 0.4102
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.3070142 0.5352291
## sample estimates:
## mean in group FALSE mean in group TRUE
## 3.679389 4.065282