This COVID-19 data set reports day-level cumulative counts of confirmed cases, deaths, and recoveries for 215 countries/regions.
Author/Distributor: SRK Version/Date: V51; April 2, 2020
Source: Kaggle https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset
# load library
# library() stops with an error when tidyverse is not installed; require()
# would only warn and return FALSE, so the script would fail later at the
# first tidyverse call instead of here
library(tidyverse)
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ----------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts -------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# show the current working directory of this R session
getwd()
## [1] "C:/Users/lai/Dropbox/Courses/bio-47120-2020"
# read the remote CSV file into a tibble; column types are guessed by readr
x <- read_csv(file = "http://diverge.hunter.cuny.edu/~weigang/covid_19_data.csv")
## Parsed with column specification:
## cols(
## SNo = col_double(),
## ObservationDate = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Last_Update = col_character(),
## Confirmed = col_double(),
## Deaths = col_double(),
## Recovered = col_double()
## )
# show per-column summaries: quartiles/mean for numeric columns,
# length/class/mode for character columns
summary(x)
## SNo ObservationDate Province_State Country_Region
## Min. : 1 Length:11299 Length:11299 Length:11299
## 1st Qu.: 2826 Class :character Class :character Class :character
## Median : 5650 Mode :character Mode :character Mode :character
## Mean : 5650
## 3rd Qu.: 8474
## Max. :11299
## Last_Update Confirmed Deaths Recovered
## Length:11299 Min. : 0 Min. : 0.00 Min. : 0.0
## Class :character 1st Qu.: 3 1st Qu.: 0.00 1st Qu.: 0.0
## Mode :character Median : 36 Median : 0.00 Median : 0.0
## Mean : 1111 Mean : 47.24 Mean : 309.1
## 3rd Qu.: 252 3rd Qu.: 2.00 3rd Qu.: 16.0
## Max. :115242 Max. :13915.00 Max. :63471.0
# compact overview: number of rows and columns, then each column's type and
# its first few values
glimpse(x)
## Observations: 11,299
## Variables: 8
## $ SNo <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ ObservationDate <chr> "01/22/2020", "01/22/2020", "01/22/2020", "01/22/20...
## $ Province_State <chr> "Anhui", "Beijing", "Chongqing", "Fujian", "Gansu",...
## $ Country_Region <chr> "Mainland China", "Mainland China", "Mainland China...
## $ Last_Update <chr> "1/22/2020 17:00", "1/22/2020 17:00", "1/22/2020 17...
## $ Confirmed <dbl> 1, 14, 6, 1, 0, 26, 2, 1, 4, 1, 0, 5, 0, 444, 4, 0,...
## $ Deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0,...
## $ Recovered <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 0, 0,...
# list the column names
names(x)
## [1] "SNo" "ObservationDate" "Province_State" "Country_Region"
## [5] "Last_Update" "Confirmed" "Deaths" "Recovered"
# parse the observation date into a Date column.
# ObservationDate holds four-digit years (e.g. "01/22/2020"), so the format
# must use %Y. With %y only the leading "20" is consumed as the year, which is
# right for 2020 by accident and silently maps every later year to 2020 too.
x <- x %>% mutate(date = as.Date(ObservationDate, format = "%m/%d/%Y"))
# cumulative confirmed cases by country & date (provinces/states summed up).
# Columns are selected by name rather than by position so this step does not
# depend on the column order of the input, and the result is ungrouped so no
# leftover grouping by date leaks into later steps.
x.confirmed <- x %>%
  select(Country_Region, Confirmed, date) %>%
  group_by(date, Country_Region) %>%
  summarise(confirmed.day.country = sum(Confirmed)) %>%
  ungroup()
# per-country total: the counts are cumulative, so a country's total is the
# largest value it reaches on any date
x.total.confirmed <- x.confirmed %>%
  group_by(Country_Region) %>%
  summarise(confirmed.country = max(confirmed.day.country))
# keep the 10 countries with the highest totals
x.top.confirmed <- x.total.confirmed %>%
  arrange(desc(confirmed.country)) %>%
  head(n = 10)
# bar plot of the top 10 countries (bars in default axis order)
ggplot(x.top.confirmed, aes(x = Country_Region, y = confirmed.country)) +
  geom_col() +
  theme_bw()
# same plot with the bars sorted by case count
ggplot(
  x.top.confirmed,
  aes(x = reorder(Country_Region, confirmed.country), y = confirmed.country)
) +
  geom_col() +
  theme_bw()
# sorted bars drawn horizontally
ggplot(
  x.top.confirmed,
  aes(x = reorder(Country_Region, confirmed.country), y = confirmed.country)
) +
  geom_col() +
  theme_bw() +
  coord_flip()
# cumulative confirmed cases over time for the top 10 countries,
# one coloured line per country
x.confirmed %>%
  filter(Country_Region %in% x.top.confirmed$Country_Region) %>%
  ggplot(
    mapping = aes(
      x = date,
      y = confirmed.day.country,
      group = Country_Region,
      color = Country_Region
    )
  ) +
  geom_line() +
  theme_bw()
# same curves with the y-axis on a log10 scale
x.confirmed %>%
  filter(Country_Region %in% x.top.confirmed$Country_Region) %>%
  ggplot(
    mapping = aes(
      x = date,
      y = confirmed.day.country,
      group = Country_Region,
      color = Country_Region
    )
  ) +
  geom_line() +
  theme_bw() +
  scale_y_log10()
# correlation analysis between confirmed and death
# cumulative deaths by date and country (provinces/states summed up);
# columns are selected by name rather than by position
x.death <- x %>%
  select(Country_Region, Deaths, date) %>%
  group_by(date, Country_Region) %>%
  summarise(death.country.date = sum(Deaths))
# total deaths by country: Deaths is a running total just like Confirmed, so
# the country total is the maximum over dates. Summing the daily values would
# count each death once for every day it has been on record.
# NOTE: the regression output printed further down was generated with the
# earlier sum-based totals and will change when the script is re-run.
x.total.death <- x.death %>%
  group_by(Country_Region) %>%
  summarise(death.country = max(death.country.date))
# combine per-country deaths with per-country confirmed cases, then derive
# mortality as deaths per 100 confirmed cases
x2 <- x.total.death %>%
  left_join(x.total.confirmed, by = "Country_Region") %>%
  mutate(mortality = death.country / confirmed.country * 100)
# scatterplot of deaths against confirmed cases, one open circle per country
ggplot(x2, aes(x = confirmed.country, y = death.country)) +
  geom_point(shape = 1) +
  theme_bw()
# same scatterplot with both axes on a log10 scale
ggplot(x2, aes(x = confirmed.country, y = death.country)) +
  geom_point(shape = 1) +
  theme_bw() +
  scale_y_log10() +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
# add 1 to both counts so zeros survive the log transform, and overlay a
# linear-model fit
ggplot(x2, aes(x = confirmed.country + 1, y = death.country + 1)) +
  geom_point(shape = 1) +
  theme_bw() +
  scale_y_log10() +
  scale_x_log10() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# rows of x2 that belong to the top 10 countries by confirmed cases
x2.top <- x2 %>%
  filter(Country_Region %in% x.top.confirmed$Country_Region)
# log-log scatterplot of all countries with a linear fit; the top 10 are drawn
# on top as larger points coloured by country
ggplot(
  data = x2,
  mapping = aes(x = confirmed.country + 1, y = death.country + 1)
) +
  geom_point(shape = 1) +
  theme_bw() +
  scale_y_log10() +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  geom_point(
    data = x2.top,
    mapping = aes(
      x = confirmed.country + 1,
      y = death.country + 1,
      color = Country_Region
    ),
    size = 4
  ) +
  xlab("num cumulative confirmed cases") +
  ylab("num cumulative death")
## `geom_smooth()` using formula 'y ~ x'
# linear regression of per-country deaths on confirmed cases (raw counts);
# the summary lists coefficient estimates with their p values
lm.covid <- lm(
  formula = death.country ~ confirmed.country,
  data = x2
)
summary(lm.covid)
##
## Call:
## lm(formula = death.country ~ confirmed.country, data = x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82333 -546 -355 -336 114778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 334.18170 832.72121 0.401 0.689
## confirmed.country 0.45547 0.03712 12.271 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11940 on 213 degrees of freedom
## Multiple R-squared: 0.4141, Adjusted R-squared: 0.4114
## F-statistic: 150.6 on 1 and 213 DF, p-value: < 2.2e-16
# same regression on log10(count + 1) for both variables
lm.covid.log <- lm(
  formula = log10(death.country + 1) ~ log10(confirmed.country + 1),
  data = x2
)
summary(lm.covid.log)
##
## Call:
## lm(formula = log10(death.country + 1) ~ log10(confirmed.country +
## 1), data = x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.74028 -0.32265 0.06331 0.40002 1.37291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.70811 0.07605 -9.311 <2e-16 ***
## log10(confirmed.country + 1) 0.91982 0.03133 29.361 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5552 on 213 degrees of freedom
## Multiple R-squared: 0.8019, Adjusted R-squared: 0.8009
## F-statistic: 862.1 on 1 and 213 DF, p-value: < 2.2e-16
# keep only the US rows, with the province/state, confirmed count, and date
x.us <- x %>%
  filter(Country_Region == "US") %>%
  select(Province_State, Confirmed, date)
# highest confirmed count reached by each US province/state
x.region <- x.us %>%
  group_by(Province_State) %>%
  summarise(confirmed.region = max(Confirmed))
# keep the 10 provinces/states with the highest counts
x.region.top <- x.region %>%
  arrange(desc(confirmed.region)) %>%
  head(n = 10)
# horizontal bar plot of the top 10 US provinces/states, sorted by case count
ggplot(
  x.region.top,
  aes(x = reorder(Province_State, confirmed.region), y = confirmed.region)
) +
  geom_col() +
  theme_bw() +
  coord_flip()
# confirmed cases over time for the top 10 US provinces/states on a log10
# y-axis; 1 is added so zero counts can be drawn
x.us %>%
  filter(Province_State %in% x.region.top$Province_State) %>%
  ggplot(
    mapping = aes(
      x = date,
      y = Confirmed + 1,
      group = Province_State,
      color = Province_State
    )
  ) +
  geom_line() +
  theme_bw() +
  scale_y_log10()