This is an RNA-Seq data set comparing gene expression levels (i.e., number of reads for each gene) of a knockout yeast strain (“SNF”) versus a wild type (“WT”) strain. Five replicate RNA extracts were performed for each strain. It is submited by Junho & based on a study by Gierlinski et al (2015): https://academic.oup.com/bioinformatics/article/31/22/3625/240923
# load library
require(tidyverse)
## Loading required package: tidyverse
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## -- Attaching packages ------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.2 v dplyr 0.8.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# show working directory
getwd()
## [1] "C:/Users/lai/Dropbox/Courses/bio-47120-2020"
# load data
x <- read_csv("http://diverge.hunter.cuny.edu/~weigang/yeast2.csv")
## Parsed with column specification:
## cols(
## Geneid = col_character(),
## Chr = col_character(),
## Start = col_character(),
## End = col_character(),
## Strand = col_character(),
## Length = col_double(),
## SNF2_1 = col_double(),
## SNF2_2 = col_double(),
## SNF2_3 = col_double(),
## SNF2_4 = col_double(),
## SNF2_5 = col_double(),
## WT_1 = col_double(),
## WT_2 = col_double(),
## WT_3 = col_double(),
## WT_4 = col_double(),
## WT_5 = col_double()
## )
# show summaries
summary(x)
## Geneid Chr Start
## Length:7126 Length:7126 Length:7126
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## End Strand Length SNF2_1
## Length:7126 Length:7126 Min. : 51 Min. : 0
## Class :character Class :character 1st Qu.: 453 1st Qu.: 111
## Mode :character Mode :character Median : 1012 Median : 526
## Mean : 1285 Mean : 1337
## 3rd Qu.: 1722 3rd Qu.: 1203
## Max. :14733 Max. :170505
## SNF2_2 SNF2_3 SNF2_4 SNF2_5
## Min. : 0.0 Min. : 0 Min. : 1 Min. : 2
## 1st Qu.: 78.0 1st Qu.: 76 1st Qu.: 77 1st Qu.: 78
## Median : 406.0 Median : 399 Median : 400 Median : 401
## Mean : 1127.1 Mean : 1137 Mean : 1138 Mean : 1139
## 3rd Qu.: 953.8 3rd Qu.: 946 3rd Qu.: 947 3rd Qu.: 948
## Max. :163536.0 Max. :173245 Max. :173246 Max. :173247
## WT_1 WT_2 WT_3
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 39.0 1st Qu.: 69.0 1st Qu.: 59.0
## Median : 217.0 Median : 360.0 Median : 289.0
## Mean : 757.2 Mean : 1012.8 Mean : 827.9
## 3rd Qu.: 546.8 3rd Qu.: 839.8 3rd Qu.: 661.8
## Max. :159252.0 Max. :165245.0 Max. :143939.0
## WT_4 WT_5
## Min. : 0.0 Min. : 0
## 1st Qu.: 82.0 1st Qu.: 65
## Median : 417.0 Median : 344
## Mean : 1332.3 Mean : 1023
## 3rd Qu.: 994.5 3rd Qu.: 803
## Max. :271082.0 Max. :187430
glimpse(x)
## Observations: 7,126
## Variables: 16
## $ Geneid <chr> "YDL248W", "YDL247W-A", "YDL247W", "YDL246C", "YDL245C"...
## $ Chr <chr> "IV", "IV", "IV", "IV", "IV", "IV", "IV", "IV", "IV", "...
## $ Start <chr> "1802", "3762", "5985", "8683", "11657", "16204", "1757...
## $ End <chr> "2953", "3836", "7814", "9756", "13360", "17226", "1856...
## $ Strand <chr> "+", "+", "+", "-", "-", "+", "-", "+", "+", "-", "+", ...
## $ Length <dbl> 1152, 75, 1830, 1074, 1704, 1023, 990, 354, 372, 138, 3...
## $ SNF2_1 <dbl> 109, 0, 6, 6, 1, 79, 363, 41, 143, 2, 670, 243, 381, 10...
## $ SNF2_2 <dbl> 84, 1, 6, 6, 6, 59, 289, 22, 119, 4, 470, 163, 335, 882...
## $ SNF2_3 <dbl> 100, 1, 1, 1, 9, 49, 243, 26, 115, 1, 429, 232, 302, 91...
## $ SNF2_4 <dbl> 101, 2, 2, 2, 10, 50, 244, 27, 116, 2, 430, 233, 303, 9...
## $ SNF2_5 <dbl> 102, 3, 3, 3, 11, 51, 245, 28, 117, 3, 431, 234, 304, 9...
## $ WT_1 <dbl> 47, 0, 2, 1, 6, 9, 192, 25, 239, 2, 185, 105, 305, 435,...
## $ WT_2 <dbl> 65, 0, 3, 3, 2, 8, 169, 30, 343, 0, 493, 161, 231, 636,...
## $ WT_3 <dbl> 60, 1, 4, 2, 5, 12, 190, 18, 251, 1, 326, 169, 279, 622...
## $ WT_4 <dbl> 95, 0, 7, 4, 5, 30, 309, 42, 555, 3, 432, 204, 444, 966...
## $ WT_5 <dbl> 43, 0, 9, 0, 6, 14, 171, 27, 323, 1, 506, 156, 179, 561...
names(x)
## [1] "Geneid" "Chr" "Start" "End" "Strand" "Length" "SNF2_1"
## [8] "SNF2_2" "SNF2_3" "SNF2_4" "SNF2_5" "WT_1" "WT_2" "WT_3"
## [15] "WT_4" "WT_5"
# plot histogram of FPKM (add 1 to avoid 0)
x %>% ggplot(aes(x=SNF2_1+1)) + geom_histogram(bins = 10) + xlab("read counts") + ylab("Num of genes") + theme_bw()
# Make normal by log tranformation
x %>% ggplot(aes(x=SNF2_1+1)) + geom_histogram(bins = 10) + xlab("read counts") + ylab("Num of genes") + scale_x_log10() + theme_bw()
# compare SNF vs Wildtype
x %>% ggplot(aes(x=SNF2_2, y= WT_1)) + geom_point() + theme_bw()
# log transform, add 1 to avoid 0
x %>% ggplot(aes(x=SNF2_2+1, y= WT_1+1)) + geom_point() + theme_bw() + scale_x_log10() + scale_y_log10()
# add a 1:1 line
x %>% ggplot(aes(x=SNF2_2+1, y= WT_1+1)) + geom_point() + theme_bw() + scale_x_log10() + scale_y_log10() + geom_abline(slope = 1, intercept = 0)
# create two log2 columns
x <- x %>% mutate(log.snf = log2(SNF2_1+1), log.wt = log2(WT_1+1))
# plot Fold change vs sum of expression levels
x %>% ggplot(aes(x=log.snf + log.wt, y=log.snf - log.wt)) + geom_point() + theme_bw()
# add a baseline, modify data shape
x %>% ggplot(aes(x=log.snf + log.wt, y=log.snf - log.wt)) + geom_point(color = "gray", shape =1) + xlab("total expression") + ylab("fold change") + geom_hline(yintercept = 0, color = "red") + theme_bw()