This is a study to identify genes involved in tissue regeneration, based on measurements of transcriptome, proteome, and metabolomics in zebrafish.

Citation: Rabinowitz et al (2017). “Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish”

Paper Source: https://www.pnas.org/content/114/5/E717

Data sets:

  1. RNA-seq (n=23,926 transcripts): S1-S12
  2. Proteome/LFQ (n=3061 proteins): S12-S25
  3. Metabolomics (Mass spectrometry): S26-S29 (Fig 4)

Figures & Tables to replicate:

For final report, you are required to:

  1. Read the paper and identify a dataset to replicate
  2. Create an R markdown file to record your work
  3. Produce a final WORD or PDF file as final report
  4. Make a 10 minute presentation based on the report.

Your final report should include the following required components: 1. Define biological questions (the overall goal of the study and the specific question addressed by your dataset) 2. Summarize experimental design & samples how the data was generated (e.g., sample, sample size, control, sequencing technology). Hint: Fig S1 3. Show R codes for replicating the visualization 4. Perform at least one statistical analysis. Show mull hypothesis and p-value. Draw statistical conclusion 5. Summarize biological conclusion & future work 6. Include full citations to paper & dataset

Load libraries & data

# load library
require(tidyverse)
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts --------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggrepel) # avoid overlapping labels
## Warning: package 'ggrepel' was built under R version 3.6.3
# show working directory
getwd()
## [1] "C:/Users/lai/Dropbox/Courses/bio-47120-2020"
# load data
x <- read_tsv("http://diverge.hunter.cuny.edu/~weigang/Rabinowitz-etal-data-sets/sd01.tsv")
## Parsed with column specification:
## cols(
##   id = col_character(),
##   symbol = col_character(),
##   name = col_character(),
##   Proximal = col_double(),
##   Distal = col_double(),
##   foldChange = col_double(),
##   log2FoldChange = col_double(),
##   pval = col_double(),
##   FDR = col_double()
## )
# show summaries
summary(x)
##       id               symbol              name              Proximal        
##  Length:23404       Length:23404       Length:23404       Min.   :      0.0  
##  Class :character   Class :character   Class :character   1st Qu.:     13.3  
##  Mode  :character   Mode  :character   Mode  :character   Median :     86.2  
##                                                           Mean   :    771.2  
##                                                           3rd Qu.:    375.6  
##                                                           Max.   :2545959.2  
##      Distal            foldChange        log2FoldChange           pval         
##  Min.   :      0.0   Min.   :      0.0   Min.   :-17.05302   Min.   :0.000000  
##  1st Qu.:     11.0   1st Qu.:      0.7   1st Qu.: -0.46061   1st Qu.:0.001344  
##  Median :     84.7   Median :      1.0   Median : -0.04386   Median :0.103543  
##  Mean   :    736.5   Mean   :    117.8   Mean   : -0.13567   Mean   :0.268205  
##  3rd Qu.:    388.6   3rd Qu.:      1.3   3rd Qu.:  0.34747   3rd Qu.:0.495860  
##  Max.   :1525588.9   Max.   :2407587.8   Max.   : 21.19916   Max.   :1.000000  
##       FDR          
##  Min.   :0.000000  
##  1st Qu.:0.005378  
##  Median :0.207078  
##  Mean   :0.341513  
##  3rd Qu.:0.661116  
##  Max.   :1.000000
glimpse(x)
## Observations: 23,404
## Variables: 9
## $ id             <chr> "ENSDARG00000000001", "ENSDARG00000000002", "ENSDARG...
## $ symbol         <chr> "slc35a5", "ccdc80", "nrf1", "ube2h", "slc9a3r1", "d...
## $ name           <chr> "solute carrier family 35, member A5", "coiled-coil ...
## $ Proximal       <dbl> 52.311723, 222.734126, 494.990919, 392.383616, 242.0...
## $ Distal         <dbl> 95.564719, 271.913235, 365.565522, 563.164795, 253.1...
## $ foldChange     <dbl> 1.8268160, 1.2207964, 0.7385303, 1.4352392, 1.045954...
## $ log2FoldChange <dbl> 0.86933131, 0.28782259, -0.43727102, 0.52129123, 0.0...
## $ pval           <dbl> 8.270000e-06, 3.129820e-02, 3.891670e-04, 1.510000e-...
## $ FDR            <dbl> 5.460000e-05, 7.892501e-02, 1.782398e-03, 9.490000e-...

Volcano plot

names(x)
## [1] "id"             "symbol"         "name"           "Proximal"      
## [5] "Distal"         "foldChange"     "log2FoldChange" "pval"          
## [9] "FDR"
# check data normality (histogram)
x %>% ggplot(aes(x=log2FoldChange)) + geom_histogram(bins =  100)

# apply log10 scale on p values & FDR (false discovery rate)
x %>% ggplot(aes(x=-log10(FDR))) + geom_histogram(bins =  100)
## Warning: Removed 2 rows containing non-finite values (stat_bin).

# add a column on -log10(p)
x <- x %>% mutate(log.p = -log10(pval), sig = if_else(FDR < 0.01, T, F))

# make volcano plot
p <- x %>% ggplot(aes(x=log2FoldChange, y=log.p, color = sig)) + geom_point(shape=1) + theme_bw() 

p

# pick genes known to have positional memory
pick <- c("ald1l1", "hapln1a", "agr2", "aldh1a2", "bco1", "and1", "ca2", "and2")
x.pick <- x %>% filter(symbol %in% pick) 

# add genes to volcano plot
p + geom_text(data = x.pick, aes(x=log2FoldChange, y=log.p, label=symbol), color = "red", vjust = "outward", hjust = "outward")

Heatmap

# read S4 data set
s4 <- read_tsv("http://diverge.hunter.cuny.edu/~weigang/Rabinowitz-etal-data-sets/sd04.tsv")
## Parsed with column specification:
## cols(
##   id = col_character(),
##   symbol = col_character(),
##   name = col_character(),
##   Proximal = col_double(),
##   Distal = col_double(),
##   foldChange = col_double(),
##   log2FoldChange = col_double(),
##   pval = col_double(),
##   FDR = col_double(),
##   Score = col_double(),
##   `ABS score` = col_double()
## )
# select columns and add column
x <- s4 %>% select(symbol, Proximal, Distal) %>% mutate(log.prox = log2(Proximal+1), log.dist = log2(Distal+1))

# create a matrix
x.mat <- as.matrix(data.frame(prox = x$log.prox, dist = x$log.dist))

# plot heatmap
heatmap(x.mat, Colv = NA)

Proteomics: PCA analysis

Take the following columns for Fig 1D 21-24: distal 1 25-27: distal 2 28-30: distal region 1 31-33: distal region 3 34-37: distal# 38-41: distal 4 42-45: proximal 1 46-48: proximal 2 49-51: proximal region 1 52-54: proximal region 3 55-58: proximal * 59-62: middle 3 63-66: proximal 3 67-70: ventral 71-74: center 75-78: dosal

# read dataset S13 & select columns
x <- read_tsv("http://diverge.hunter.cuny.edu/~weigang/Rabinowitz-etal-data-sets/sd13.tsv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Uniprot = col_character(),
##   `Gene Name` = col_character(),
##   `Fasta header` = col_character()
## )
## See spec(...) for full column specifications.
x.exp <- x %>% select(1,21:78)

# make a long/tiday table
x.long <- x.exp %>% gather(2:59, key = "body.position", value = "protein.amount")

# histogram to check normality
x.long %>% ggplot(aes(x=protein.amount)) + geom_histogram(bins = 100) + theme_bw()

# violin plot to show distribution by body positions
x.long %>% ggplot(aes(x=body.position, y=protein.amount)) + geom_violin() + coord_flip() + theme_bw()

# PCA analysis

## select columns (individual experiments with technical replicates) and create a matrix
x.mat <- x.exp %>% select(2:59) %>% as.matrix()

# transpose matrix (columns to rows & rows to columns)
x.mat2 <- t(x.mat) # we want to compare positions, not genes

# perform PCA
x.pca <- prcomp(x.mat2)
summary(x.pca)
## Importance of components:
##                            PC1     PC2      PC3      PC4      PC5      PC6
## Standard deviation     37.6138 26.0849 18.91785 15.24832 11.84763 11.02318
## Proportion of Variance  0.2776  0.1335  0.07023  0.04563  0.02755  0.02385
## Cumulative Proportion   0.2776  0.4112  0.48140  0.52703  0.55458  0.57842
##                            PC7    PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     9.46031 8.9732 8.88011 8.57090 8.20800 7.64775 7.58476
## Proportion of Variance 0.01756 0.0158 0.01547 0.01442 0.01322 0.01148 0.01129
## Cumulative Proportion  0.59599 0.6118 0.62726 0.64168 0.65490 0.66638 0.67767
##                           PC14    PC15    PC16    PC17    PC18    PC19    PC20
## Standard deviation     7.46600 7.19751 7.16385 7.07330 7.01476 6.90376 6.82872
## Proportion of Variance 0.01094 0.01017 0.01007 0.00982 0.00966 0.00935 0.00915
## Cumulative Proportion  0.68861 0.69877 0.70884 0.71866 0.72832 0.73767 0.74682
##                           PC21    PC22   PC23    PC24    PC25    PC26    PC27
## Standard deviation     6.75090 6.66968 6.6212 6.57412 6.54046 6.46843 6.40096
## Proportion of Variance 0.00894 0.00873 0.0086 0.00848 0.00839 0.00821 0.00804
## Cumulative Proportion  0.75577 0.76450 0.7731 0.78158 0.78998 0.79819 0.80623
##                          PC28    PC29    PC30    PC31    PC32    PC33    PC34
## Standard deviation     6.3849 6.30711 6.26984 6.22913 6.18961 6.16581 6.07217
## Proportion of Variance 0.0080 0.00781 0.00771 0.00761 0.00752 0.00746 0.00724
## Cumulative Proportion  0.8142 0.82203 0.82975 0.83736 0.84488 0.85234 0.85958
##                           PC35    PC36    PC37    PC38    PC39    PC40    PC41
## Standard deviation     6.04587 6.00395 5.94348 5.85848 5.84834 5.79324 5.77431
## Proportion of Variance 0.00717 0.00707 0.00693 0.00674 0.00671 0.00659 0.00654
## Cumulative Proportion  0.86675 0.87383 0.88076 0.88749 0.89420 0.90079 0.90733
##                           PC42    PC43   PC44    PC45    PC46    PC47    PC48
## Standard deviation     5.75971 5.69706 5.6672 5.60399 5.59575 5.54620 5.48810
## Proportion of Variance 0.00651 0.00637 0.0063 0.00616 0.00614 0.00604 0.00591
## Cumulative Proportion  0.91384 0.92021 0.9265 0.93268 0.93882 0.94486 0.95077
##                           PC49    PC50   PC51    PC52    PC53    PC54    PC55
## Standard deviation     5.46679 5.40685 5.3911 5.36349 5.26753 5.25946 5.22093
## Proportion of Variance 0.00586 0.00574 0.0057 0.00565 0.00545 0.00543 0.00535
## Cumulative Proportion  0.95664 0.96237 0.9681 0.97372 0.97917 0.98460 0.98994
##                           PC56    PC57      PC58
## Standard deviation     5.06840 5.05464 5.894e-14
## Proportion of Variance 0.00504 0.00501 0.000e+00
## Cumulative Proportion  0.99499 1.00000 1.000e+00
plot(x.pca) # get variances distribution

# obtain coordinates
x.pca.coord <- as.data.frame(x.pca[[5]])

# transform sample names (for later use)
sample.names <- rownames(x.pca.coord)

## groups for each sample
sample.names2 <- str_remove(sample.names, "_\\d+") %>% str_remove("\\#\\d")

## groups for each position
sample.names3 <- sample.names2 %>% str_remove("_.+")

# plot 1st & 2nd principal components
x.pca.coord %>% ggplot(aes(x=PC1, y = PC2, label = sample.names)) + geom_point(shape= 1) + geom_text() +  theme_bw()

# replot with geom_text_repel
x.pca.coord %>% ggplot(aes(x=PC1, y = PC2, label = sample.names)) + geom_point(shape= 1) + geom_text_repel() +  theme_bw()

# color by body positions
x.pca.coord %>% ggplot(aes(x=PC1, y = PC2, color = sample.names3, label = sample.names3)) + geom_point(size=3) +  theme_bw()