This is a study to identify genes involved in tissue regeneration, based on measurements of transcriptome, proteome, and metabolomics in zebrafish.
Citation: Rabinowitz et al (2017). “Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish”
Paper Source: https://www.pnas.org/content/114/5/E717
Table 1. Overlap between RNAseq & LFQ
Fig 3
Early regneration: Fig 5 (S15, S22, S23)
Your final report should include the following required components: 1. Define biological questions (the overall goal of the study and the specific question addressed by your dataset) 2. Summarize experimental design & samples how the data was generated (e.g., sample, sample size, control, sequencing technology). Hint: Fig S1 3. Show R codes for replicating the visualization 4. Perform at least one statistical analysis. Show mull hypothesis and p-value. Draw statistical conclusion 5. Summarize biological conclusion & future work 6. Include full citations to paper & dataset
# load library
require(tidyverse)
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts --------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggrepel) # avoid overlapping labels
## Warning: package 'ggrepel' was built under R version 3.6.3
# show working directory
getwd()
## [1] "C:/Users/lai/Dropbox/Courses/bio-47120-2020"
# load data
x <- read_tsv("http://diverge.hunter.cuny.edu/~weigang/Rabinowitz-etal-data-sets/sd01.tsv")
## Parsed with column specification:
## cols(
## id = col_character(),
## symbol = col_character(),
## name = col_character(),
## Proximal = col_double(),
## Distal = col_double(),
## foldChange = col_double(),
## log2FoldChange = col_double(),
## pval = col_double(),
## FDR = col_double()
## )
# show summaries
summary(x)
## id symbol name Proximal
## Length:23404 Length:23404 Length:23404 Min. : 0.0
## Class :character Class :character Class :character 1st Qu.: 13.3
## Mode :character Mode :character Mode :character Median : 86.2
## Mean : 771.2
## 3rd Qu.: 375.6
## Max. :2545959.2
## Distal foldChange log2FoldChange pval
## Min. : 0.0 Min. : 0.0 Min. :-17.05302 Min. :0.000000
## 1st Qu.: 11.0 1st Qu.: 0.7 1st Qu.: -0.46061 1st Qu.:0.001344
## Median : 84.7 Median : 1.0 Median : -0.04386 Median :0.103543
## Mean : 736.5 Mean : 117.8 Mean : -0.13567 Mean :0.268205
## 3rd Qu.: 388.6 3rd Qu.: 1.3 3rd Qu.: 0.34747 3rd Qu.:0.495860
## Max. :1525588.9 Max. :2407587.8 Max. : 21.19916 Max. :1.000000
## FDR
## Min. :0.000000
## 1st Qu.:0.005378
## Median :0.207078
## Mean :0.341513
## 3rd Qu.:0.661116
## Max. :1.000000
glimpse(x)
## Observations: 23,404
## Variables: 9
## $ id <chr> "ENSDARG00000000001", "ENSDARG00000000002", "ENSDARG...
## $ symbol <chr> "slc35a5", "ccdc80", "nrf1", "ube2h", "slc9a3r1", "d...
## $ name <chr> "solute carrier family 35, member A5", "coiled-coil ...
## $ Proximal <dbl> 52.311723, 222.734126, 494.990919, 392.383616, 242.0...
## $ Distal <dbl> 95.564719, 271.913235, 365.565522, 563.164795, 253.1...
## $ foldChange <dbl> 1.8268160, 1.2207964, 0.7385303, 1.4352392, 1.045954...
## $ log2FoldChange <dbl> 0.86933131, 0.28782259, -0.43727102, 0.52129123, 0.0...
## $ pval <dbl> 8.270000e-06, 3.129820e-02, 3.891670e-04, 1.510000e-...
## $ FDR <dbl> 5.460000e-05, 7.892501e-02, 1.782398e-03, 9.490000e-...
names(x)
## [1] "id" "symbol" "name" "Proximal"
## [5] "Distal" "foldChange" "log2FoldChange" "pval"
## [9] "FDR"
# check data normality (histogram)
x %>% ggplot(aes(x=log2FoldChange)) + geom_histogram(bins = 100)
# apply log10 scale on p values & FDR (false discovery rate)
x %>% ggplot(aes(x=-log10(FDR))) + geom_histogram(bins = 100)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# add a column on -log10(p)
x <- x %>% mutate(log.p = -log10(pval), sig = if_else(FDR < 0.01, T, F))
# make volcano plot
p <- x %>% ggplot(aes(x=log2FoldChange, y=log.p, color = sig)) + geom_point(shape=1) + theme_bw()
p
# pick genes known to have positional memory
pick <- c("ald1l1", "hapln1a", "agr2", "aldh1a2", "bco1", "and1", "ca2", "and2")
x.pick <- x %>% filter(symbol %in% pick)
# add genes to volcano plot
p + geom_text(data = x.pick, aes(x=log2FoldChange, y=log.p, label=symbol), color = "red", vjust = "outward", hjust = "outward")
# read S4 data set
s4 <- read_tsv("http://diverge.hunter.cuny.edu/~weigang/Rabinowitz-etal-data-sets/sd04.tsv")
## Parsed with column specification:
## cols(
## id = col_character(),
## symbol = col_character(),
## name = col_character(),
## Proximal = col_double(),
## Distal = col_double(),
## foldChange = col_double(),
## log2FoldChange = col_double(),
## pval = col_double(),
## FDR = col_double(),
## Score = col_double(),
## `ABS score` = col_double()
## )
# select columns and add column
x <- s4 %>% select(symbol, Proximal, Distal) %>% mutate(log.prox = log2(Proximal+1), log.dist = log2(Distal+1))
# create a matrix
x.mat <- as.matrix(data.frame(prox = x$log.prox, dist = x$log.dist))
# plot heatmap
heatmap(x.mat, Colv = NA)
Take the following columns for Fig 1D 21-24: distal 1 25-27: distal 2 28-30: distal region 1 31-33: distal region 3 34-37: distal# 38-41: distal 4 42-45: proximal 1 46-48: proximal 2 49-51: proximal region 1 52-54: proximal region 3 55-58: proximal * 59-62: middle 3 63-66: proximal 3 67-70: ventral 71-74: center 75-78: dosal
# read dataset S13 & select columns
x <- read_tsv("http://diverge.hunter.cuny.edu/~weigang/Rabinowitz-etal-data-sets/sd13.tsv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Uniprot = col_character(),
## `Gene Name` = col_character(),
## `Fasta header` = col_character()
## )
## See spec(...) for full column specifications.
x.exp <- x %>% select(1,21:78)
# make a long/tiday table
x.long <- x.exp %>% gather(2:59, key = "body.position", value = "protein.amount")
# histogram to check normality
x.long %>% ggplot(aes(x=protein.amount)) + geom_histogram(bins = 100) + theme_bw()
# violin plot to show distribution by body positions
x.long %>% ggplot(aes(x=body.position, y=protein.amount)) + geom_violin() + coord_flip() + theme_bw()
# PCA analysis
## select columns (individual experiments with technical replicates) and create a matrix
x.mat <- x.exp %>% select(2:59) %>% as.matrix()
# transpose matrix (columns to rows & rows to columns)
x.mat2 <- t(x.mat) # we want to compare positions, not genes
# perform PCA
x.pca <- prcomp(x.mat2)
summary(x.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 37.6138 26.0849 18.91785 15.24832 11.84763 11.02318
## Proportion of Variance 0.2776 0.1335 0.07023 0.04563 0.02755 0.02385
## Cumulative Proportion 0.2776 0.4112 0.48140 0.52703 0.55458 0.57842
## PC7 PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 9.46031 8.9732 8.88011 8.57090 8.20800 7.64775 7.58476
## Proportion of Variance 0.01756 0.0158 0.01547 0.01442 0.01322 0.01148 0.01129
## Cumulative Proportion 0.59599 0.6118 0.62726 0.64168 0.65490 0.66638 0.67767
## PC14 PC15 PC16 PC17 PC18 PC19 PC20
## Standard deviation 7.46600 7.19751 7.16385 7.07330 7.01476 6.90376 6.82872
## Proportion of Variance 0.01094 0.01017 0.01007 0.00982 0.00966 0.00935 0.00915
## Cumulative Proportion 0.68861 0.69877 0.70884 0.71866 0.72832 0.73767 0.74682
## PC21 PC22 PC23 PC24 PC25 PC26 PC27
## Standard deviation 6.75090 6.66968 6.6212 6.57412 6.54046 6.46843 6.40096
## Proportion of Variance 0.00894 0.00873 0.0086 0.00848 0.00839 0.00821 0.00804
## Cumulative Proportion 0.75577 0.76450 0.7731 0.78158 0.78998 0.79819 0.80623
## PC28 PC29 PC30 PC31 PC32 PC33 PC34
## Standard deviation 6.3849 6.30711 6.26984 6.22913 6.18961 6.16581 6.07217
## Proportion of Variance 0.0080 0.00781 0.00771 0.00761 0.00752 0.00746 0.00724
## Cumulative Proportion 0.8142 0.82203 0.82975 0.83736 0.84488 0.85234 0.85958
## PC35 PC36 PC37 PC38 PC39 PC40 PC41
## Standard deviation 6.04587 6.00395 5.94348 5.85848 5.84834 5.79324 5.77431
## Proportion of Variance 0.00717 0.00707 0.00693 0.00674 0.00671 0.00659 0.00654
## Cumulative Proportion 0.86675 0.87383 0.88076 0.88749 0.89420 0.90079 0.90733
## PC42 PC43 PC44 PC45 PC46 PC47 PC48
## Standard deviation 5.75971 5.69706 5.6672 5.60399 5.59575 5.54620 5.48810
## Proportion of Variance 0.00651 0.00637 0.0063 0.00616 0.00614 0.00604 0.00591
## Cumulative Proportion 0.91384 0.92021 0.9265 0.93268 0.93882 0.94486 0.95077
## PC49 PC50 PC51 PC52 PC53 PC54 PC55
## Standard deviation 5.46679 5.40685 5.3911 5.36349 5.26753 5.25946 5.22093
## Proportion of Variance 0.00586 0.00574 0.0057 0.00565 0.00545 0.00543 0.00535
## Cumulative Proportion 0.95664 0.96237 0.9681 0.97372 0.97917 0.98460 0.98994
## PC56 PC57 PC58
## Standard deviation 5.06840 5.05464 5.894e-14
## Proportion of Variance 0.00504 0.00501 0.000e+00
## Cumulative Proportion 0.99499 1.00000 1.000e+00
plot(x.pca) # get variances distribution
# obtain coordinates
x.pca.coord <- as.data.frame(x.pca[[5]])
# transform sample names (for later use)
sample.names <- rownames(x.pca.coord)
## groups for each sample
sample.names2 <- str_remove(sample.names, "_\\d+") %>% str_remove("\\#\\d")
## groups for each position
sample.names3 <- sample.names2 %>% str_remove("_.+")
# plot 1st & 2nd principal components
x.pca.coord %>% ggplot(aes(x=PC1, y = PC2, label = sample.names)) + geom_point(shape= 1) + geom_text() + theme_bw()
# replot with geom_text_repel
x.pca.coord %>% ggplot(aes(x=PC1, y = PC2, label = sample.names)) + geom_point(shape= 1) + geom_text_repel() + theme_bw()
# color by body positions
x.pca.coord %>% ggplot(aes(x=PC1, y = PC2, color = sample.names3, label = sample.names3)) + geom_point(size=3) + theme_bw()