This is a COVID-19 data set describing day-level COVID-19 cases from 215 countries.

Author/Distributor: SRK Version/Date: V51; April 2, 2020

Source: Kaggle https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset

Load libraries & data

# load the tidyverse; library() stops with an error when the package is
# missing, whereas require() only returns FALSE and lets the script continue
library(tidyverse)
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ----------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts -------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# report the directory the session is running in
print(getwd())
## [1] "C:/Users/lai/Dropbox/Courses/bio-47120-2020"
# read the case table from the remote CSV file into a tibble
x <- read_csv(file = "http://diverge.hunter.cuny.edu/~weigang/covid_19_data.csv")
## Parsed with column specification:
## cols(
##   SNo = col_double(),
##   ObservationDate = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_character(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double()
## )
# print per-column summary statistics of the loaded table
x %>% summary()
##       SNo        ObservationDate    Province_State     Country_Region    
##  Min.   :    1   Length:11299       Length:11299       Length:11299      
##  1st Qu.: 2826   Class :character   Class :character   Class :character  
##  Median : 5650   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 5650                                                           
##  3rd Qu.: 8474                                                           
##  Max.   :11299                                                           
##  Last_Update          Confirmed          Deaths           Recovered      
##  Length:11299       Min.   :     0   Min.   :    0.00   Min.   :    0.0  
##  Class :character   1st Qu.:     3   1st Qu.:    0.00   1st Qu.:    0.0  
##  Mode  :character   Median :    36   Median :    0.00   Median :    0.0  
##                     Mean   :  1111   Mean   :   47.24   Mean   :  309.1  
##                     3rd Qu.:   252   3rd Qu.:    2.00   3rd Qu.:   16.0  
##                     Max.   :115242   Max.   :13915.00   Max.   :63471.0
# compact preview: one line per column with its type and first values
x %>% glimpse()
## Observations: 11,299
## Variables: 8
## $ SNo             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ ObservationDate <chr> "01/22/2020", "01/22/2020", "01/22/2020", "01/22/20...
## $ Province_State  <chr> "Anhui", "Beijing", "Chongqing", "Fujian", "Gansu",...
## $ Country_Region  <chr> "Mainland China", "Mainland China", "Mainland China...
## $ Last_Update     <chr> "1/22/2020 17:00", "1/22/2020 17:00", "1/22/2020 17...
## $ Confirmed       <dbl> 1, 14, 6, 1, 0, 26, 2, 1, 4, 1, 0, 5, 0, 444, 4, 0,...
## $ Deaths          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0,...
## $ Recovered       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 0, 0,...

Analysis of confirmed cases

# list the column names of the loaded table
colnames(x)
## [1] "SNo"             "ObservationDate" "Province_State"  "Country_Region" 
## [5] "Last_Update"     "Confirmed"       "Deaths"          "Recovered"
# parse the observation date; the source strings carry a four-digit year
# (e.g. "01/22/2020"), so the format must use %Y -- %y reads only two digits
# of the year and happens to give the right answer for 2020 alone
x <- x %>%
  mutate(date = as.Date(ObservationDate, format = "%m/%d/%Y"))

# count confirmed totals by country & date (sums all rows that share a date
# and a country); columns are selected by name so the code does not depend
# on the column order of the input file
x.confirmed <- x %>%
  select(Country_Region, Confirmed, date) %>%
  group_by(date, Country_Region) %>%
  summarise(confirmed.day.country = sum(Confirmed)) %>%
  ungroup() # drop the leftover grouping by date

Top 10 countries of confirmed cases

# per-country total: the daily figures are cumulative, so the total for a
# country is its largest daily value
x.total.confirmed <- x.confirmed %>%
  group_by(Country_Region) %>%
  summarise(confirmed.country = max(confirmed.day.country))

# keep the ten countries with the largest totals
x.top.confirmed <- x.total.confirmed %>%
  arrange(desc(confirmed.country)) %>%
  head(10)

# bar plot of the top-10 totals (bars in alphabetical country order)
x.top.confirmed %>%
  ggplot(aes(x = Country_Region, y = confirmed.country)) +
  geom_col() +
  theme_bw()

# same plot with the bars ordered by total
x.top.confirmed %>%
  ggplot(aes(x = reorder(Country_Region, confirmed.country), y = confirmed.country)) +
  geom_col() +
  theme_bw()

# ordered bars again, drawn horizontally
x.top.confirmed %>%
  ggplot(aes(x = reorder(Country_Region, confirmed.country), y = confirmed.country)) +
  geom_col() +
  theme_bw() +
  coord_flip()

# daily confirmed counts over time, one line per top-10 country
x.confirmed %>%
  filter(Country_Region %in% x.top.confirmed$Country_Region) %>%
  ggplot(aes(x = date, y = confirmed.day.country,
             group = Country_Region, color = Country_Region)) +
  geom_line() +
  theme_bw()

# same curves with a log10 y-axis
x.confirmed %>%
  filter(Country_Region %in% x.top.confirmed$Country_Region) %>%
  ggplot(aes(x = date, y = confirmed.day.country,
             group = Country_Region, color = Country_Region)) +
  geom_line() +
  theme_bw() +
  scale_y_log10()

# correlation analysis between confirmed and death

# number of deaths by date and country (sums all rows that share a date and a
# country); columns are selected by name rather than by position
x.death <- x %>%
  select(Country_Region, Deaths, date) %>%
  group_by(date, Country_Region) %>%
  summarise(death.country.date = sum(Deaths)) %>%
  ungroup() # drop the leftover grouping by date

# number of deaths by country; the daily counts are cumulative, so the country
# total is the maximum daily value (same rule as for confirmed cases) --
# summing across dates would count the same deaths once per reporting day
# NOTE: recorded regression output produced with the earlier sum() version
# must be regenerated after this change
x.total.death <- x.death %>%
  group_by(Country_Region) %>%
  summarise(death.country = max(death.country.date))

# combine per-country deaths with per-country confirmed totals, then derive
# mortality as deaths per 100 confirmed cases
x2 <- x.total.death %>%
  left_join(x.total.confirmed, by = "Country_Region") %>%
  mutate(mortality = death.country / confirmed.country * 100)

# deaths against confirmed cases, one open circle per country
x2 %>%
  ggplot(aes(x = confirmed.country, y = death.country)) +
  geom_point(shape = 1) +
  theme_bw()

# same scatterplot with both axes on a log10 scale
x2 %>%
  ggplot(aes(x = confirmed.country, y = death.country)) +
  geom_point(shape = 1) +
  theme_bw() +
  scale_y_log10() +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

# shift both counts by 1 so zeros survive the log transform, and overlay a
# linear-model fit
x2 %>%
  ggplot(aes(x = confirmed.country + 1, y = death.country + 1)) +
  geom_point(shape = 1) +
  theme_bw() +
  scale_y_log10() +
  scale_x_log10() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# highlight top 10 countries
# refer to the column by its bare name inside filter(): `x2$Country_Region`
# bypasses dplyr's data masking and would silently read the wrong rows if the
# piped table were ever grouped or different from the global x2
x2.top <- x2 %>%
  filter(Country_Region %in% x.top.confirmed$Country_Region)

# all countries as open circles with a linear fit; the top-10 countries are
# drawn on top as larger, colored points
ggplot(data = x2, aes(x = confirmed.country + 1, y = death.country + 1)) +
  geom_point(shape = 1) +
  theme_bw() +
  scale_y_log10() +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  geom_point(
    data = x2.top,
    aes(x = confirmed.country + 1, y = death.country + 1, color = Country_Region),
    size = 4
  ) +
  xlab("num cumulative confirmed cases") +
  ylab("num cumulative death")
## `geom_smooth()` using formula 'y ~ x'

# fit deaths as a linear function of confirmed cases (raw scale) and print
# the coefficient table with p values
lm.covid <- lm(formula = death.country ~ confirmed.country, data = x2)
summary(lm.covid)
## 
## Call:
## lm(formula = death.country ~ confirmed.country, data = x2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -82333   -546   -355   -336 114778 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       334.18170  832.72121   0.401    0.689    
## confirmed.country   0.45547    0.03712  12.271   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11940 on 213 degrees of freedom
## Multiple R-squared:  0.4141, Adjusted R-squared:  0.4114 
## F-statistic: 150.6 on 1 and 213 DF,  p-value: < 2.2e-16
# same regression on the log10 scale, with counts shifted by 1 to allow zeros
lm.covid.log <- lm(
  formula = log10(death.country + 1) ~ log10(confirmed.country + 1),
  data = x2
)
summary(lm.covid.log)
## 
## Call:
## lm(formula = log10(death.country + 1) ~ log10(confirmed.country + 
##     1), data = x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.74028 -0.32265  0.06331  0.40002  1.37291 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.70811    0.07605  -9.311   <2e-16 ***
## log10(confirmed.country + 1)  0.91982    0.03133  29.361   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5552 on 213 degrees of freedom
## Multiple R-squared:  0.8019, Adjusted R-squared:  0.8009 
## F-statistic: 862.1 on 1 and 213 DF,  p-value: < 2.2e-16

US studies

# keep the US rows with only the region, confirmed count and date columns
x.us <- x %>%
  filter(Country_Region == "US") %>%
  select(Province_State, Confirmed, date)

# largest confirmed count recorded for each region
x.region <- x.us %>%
  group_by(Province_State) %>%
  summarise(confirmed.region = max(Confirmed))

# the ten regions with the largest counts
x.region.top <- x.region %>%
  arrange(desc(confirmed.region)) %>%
  head(10)

# horizontal bar plot of the top-10 regions, ordered by count
x.region.top %>%
  ggplot(aes(x = reorder(Province_State, confirmed.region), y = confirmed.region)) +
  geom_col() +
  theme_bw() +
  coord_flip()

# confirmed counts over time for the top-10 regions, log10 y-axis
# (counts shifted by 1 so zeros can be drawn)
x.us %>%
  filter(Province_State %in% x.region.top$Province_State) %>%
  ggplot(aes(x = date, y = Confirmed + 1,
             group = Province_State, color = Province_State)) +
  geom_line() +
  theme_bw() +
  scale_y_log10()

To prepare for next week’s quiz

  1. Repeat the above code
  2. Replicate for cases of “Deaths” and “Recovered”
  3. Review Self-study-2 & Self-study-3