This exercise explore the distribution of gene lengths in the human genome
First, we load the libraries:
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Second, we set working direction & read the dataset:
setwd("../bio-714-spring/")
hg <- read.table("hg.tsv2", sep="\t", header=T)
glimpse(hg)
## Observations: 7,828
## Variables: 7
## $ Gene.ID <fct> ENSG00000198888, ENSG00000198763, ENSG00000198804,...
## $ Gene.Start <int> 3307, 4470, 5904, 7586, 8366, 8527, 9207, 10059, 1...
## $ Gene.End <int> 4262, 5511, 7445, 8269, 8572, 9207, 9990, 10404, 1...
## $ Description <fct> mitochondrially encoded NADH:ubiquinone oxidoreduc...
## $ Chromosome <fct> MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, MT...
## $ Strand <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, ...
## $ Gene.Name <fct> MT-ND1, MT-ND2, MT-CO1, MT-CO2, MT-ATP8, MT-ATP6, ...
dim(hg)
## [1] 7828 7
head(hg)
## Gene.ID Gene.Start Gene.End
## 1 ENSG00000198888 3307 4262
## 2 ENSG00000198763 4470 5511
## 3 ENSG00000198804 5904 7445
## 4 ENSG00000198712 7586 8269
## 5 ENSG00000228253 8366 8572
## 6 ENSG00000198899 8527 9207
## Description
## 1 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 1 [Source:HGNC Symbol;Acc:HGNC:7455]
## 2 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 2 [Source:HGNC Symbol;Acc:HGNC:7456]
## 3 mitochondrially encoded cytochrome c oxidase I [Source:HGNC Symbol;Acc:HGNC:7419]
## 4 mitochondrially encoded cytochrome c oxidase II [Source:HGNC Symbol;Acc:HGNC:7421]
## 5 mitochondrially encoded ATP synthase 8 [Source:HGNC Symbol;Acc:HGNC:7415]
## 6 mitochondrially encoded ATP synthase 6 [Source:HGNC Symbol;Acc:HGNC:7414]
## Chromosome Strand Gene.Name
## 1 MT 1 MT-ND1
## 2 MT 1 MT-ND2
## 3 MT 1 MT-CO1
## 4 MT 1 MT-CO2
## 5 MT 1 MT-ATP8
## 6 MT 1 MT-ATP6
tail(hg)
## Gene.ID Gene.Start Gene.End
## 7823 ENSG00000100246 38778508 38794198
## 7824 ENSG00000100034 21919420 21952837
## 7825 ENSG00000105248 4247079 4269090
## 7826 ENSG00000128973 68206992 68257211
## 7827 ENSG00000151655 7703269 7749520
## 7828 ENSG00000141338 68867292 68955392
## Description
## 7823 dynein, axonemal, light chain 4 [Source:HGNC Symbol;Acc:HGNC:2955]
## 7824 protein phosphatase, Mg2+/Mn2+ dependent 1F [Source:HGNC Symbol;Acc:HGNC:19388]
## 7825 coiled-coil domain containing 94 [Source:HGNC Symbol;Acc:HGNC:25518]
## 7826 ceroid-lipofuscinosis, neuronal 6, late infantile, variant [Source:HGNC Symbol;Acc:HGNC:2077]
## 7827 inter-alpha-trypsin inhibitor heavy chain 2 [Source:HGNC Symbol;Acc:HGNC:6167]
## 7828 ATP binding cassette subfamily A member 8 [Source:HGNC Symbol;Acc:HGNC:38]
## Chromosome Strand Gene.Name
## 7823 22 -1 DNAL4
## 7824 22 -1 PPM1F
## 7825 19 1 CCDC94
## 7826 15 -1 CLN6
## 7827 10 1 ITIH2
## 7828 17 -1 ABCA8
class(hg)
## [1] "data.frame"
Third, we plot/visualize the data
# add a column
hg2 <- mutate(hg, Gene.length = Gene.End - Gene.Start + 1)
head(hg2)
## Gene.ID Gene.Start Gene.End
## 1 ENSG00000198888 3307 4262
## 2 ENSG00000198763 4470 5511
## 3 ENSG00000198804 5904 7445
## 4 ENSG00000198712 7586 8269
## 5 ENSG00000228253 8366 8572
## 6 ENSG00000198899 8527 9207
## Description
## 1 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 1 [Source:HGNC Symbol;Acc:HGNC:7455]
## 2 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 2 [Source:HGNC Symbol;Acc:HGNC:7456]
## 3 mitochondrially encoded cytochrome c oxidase I [Source:HGNC Symbol;Acc:HGNC:7419]
## 4 mitochondrially encoded cytochrome c oxidase II [Source:HGNC Symbol;Acc:HGNC:7421]
## 5 mitochondrially encoded ATP synthase 8 [Source:HGNC Symbol;Acc:HGNC:7415]
## 6 mitochondrially encoded ATP synthase 6 [Source:HGNC Symbol;Acc:HGNC:7414]
## Chromosome Strand Gene.Name Gene.length
## 1 MT 1 MT-ND1 956
## 2 MT 1 MT-ND2 1042
## 3 MT 1 MT-CO1 1542
## 4 MT 1 MT-CO2 684
## 5 MT 1 MT-ATP8 207
## 6 MT 1 MT-ATP6 681
# filter genes on mitochondria:
hg.mt <- filter(hg, Chromosome == 'MT')
dim(hg.mt)
## [1] 13 7
# filter long genes:
hg.long <- filter(hg2, Gene.length > 1e6)
hg.long
## Gene.ID Gene.Start Gene.End
## 1 ENSG00000175497 114442299 115845752
## 2 ENSG00000164796 112222928 113437099
## 3 ENSG00000196090 42072752 43189970
## 4 ENSG00000185565 115802363 117139389
## 5 ENSG00000172264 13995369 16053197
## 6 ENSG00000149972 99020953 100358885
## 7 ENSG00000185046 98726457 99984654
## 8 ENSG00000183117 2935353 4994972
## 9 ENSG00000155093 157539056 158587788
## 10 ENSG00000186094 48532855 50023913
## 11 ENSG00000153707 8314246 10612723
## 12 ENSG00000169946 104590733 105804532
## 13 ENSG00000174469 146116002 148420998
## 14 ENSG00000113448 58969038 60522120
## 15 ENSG00000189283 59749310 61251459
## 16 ENSG00000152208 92303622 93774556
## 17 ENSG00000158321 69598919 70793068
## 18 ENSG00000144451 213284379 214410501
## 19 ENSG00000189108 104566315 105767829
## 20 ENSG00000148948 40114203 41459773
## 21 ENSG00000183098 93226842 94407401
## Description
## 1 dipeptidyl-peptidase 10 (inactive) [Source:HGNC Symbol;Acc:HGNC:20823]
## 2 CUB and Sushi multiple domains 3 [Source:HGNC Symbol;Acc:HGNC:19291]
## 3 protein tyrosine phosphatase, receptor type, T [Source:HGNC Symbol;Acc:HGNC:9682]
## 4 limbic system-associated membrane protein [Source:HGNC Symbol;Acc:HGNC:6705]
## 5 MACRO domain containing 2 [Source:HGNC Symbol;Acc:HGNC:16126]
## 6 contactin 5 [Source:HGNC Symbol;Acc:HGNC:2175]
## 7 ankyrin repeat and sterile alpha motif domain containing 1B [Source:HGNC Symbol;Acc:HGNC:24600]
## 8 CUB and Sushi multiple domains 1 [Source:HGNC Symbol;Acc:HGNC:14026]
## 9 protein tyrosine phosphatase, receptor type, N polypeptide 2 [Source:HGNC Symbol;Acc:HGNC:9677]
## 10 ATP/GTP binding protein-like 4 [Source:HGNC Symbol;Acc:HGNC:25892]
## 11 protein tyrosine phosphatase, receptor type, D [Source:HGNC Symbol;Acc:HGNC:9668]
## 12 zinc finger protein, FOG family member 2 [Source:HGNC Symbol;Acc:HGNC:16700]
## 13 contactin associated protein-like 2 [Source:HGNC Symbol;Acc:HGNC:13830]
## 14 phosphodiesterase 4D [Source:HGNC Symbol;Acc:HGNC:8783]
## 15 fragile histidine triad [Source:HGNC Symbol;Acc:HGNC:3701]
## 16 glutamate receptor, ionotropic, delta 2 [Source:HGNC Symbol;Acc:HGNC:4576]
## 17 autism susceptibility candidate 2 [Source:HGNC Symbol;Acc:HGNC:14262]
## 18 sperm associated antigen 16 [Source:HGNC Symbol;Acc:HGNC:23225]
## 19 interleukin 1 receptor accessory protein-like 2 [Source:HGNC Symbol;Acc:HGNC:5997]
## 20 leucine rich repeat containing 4C [Source:HGNC Symbol;Acc:HGNC:29317]
## 21 glypican 6 [Source:HGNC Symbol;Acc:HGNC:4454]
## Chromosome Strand Gene.Name Gene.length
## 1 2 1 DPP10 1403454
## 2 8 -1 CSMD3 1214172
## 3 20 -1 PTPRT 1117219
## 4 3 -1 LSAMP 1337027
## 5 20 1 MACROD2 2057829
## 6 11 1 CNTN5 1337933
## 7 12 -1 ANKS1B 1258198
## 8 8 -1 CSMD1 2059620
## 9 7 -1 PTPRN2 1048733
## 10 1 -1 AGBL4 1491059
## 11 9 -1 PTPRD 2298478
## 12 8 1 ZFPM2 1213800
## 13 7 1 CNTNAP2 2304997
## 14 5 -1 PDE4D 1553083
## 15 3 -1 FHIT 1502150
## 16 4 1 GRID2 1470935
## 17 7 1 AUTS2 1194150
## 18 2 1 SPAG16 1126123
## 19 X 1 IL1RAPL2 1201515
## 20 11 -1 LRRC4C 1345571
## 21 13 1 GPC6 1180560
# select columns
hg3 <- select(hg2, -4)
head(hg3)
## Gene.ID Gene.Start Gene.End Chromosome Strand Gene.Name
## 1 ENSG00000198888 3307 4262 MT 1 MT-ND1
## 2 ENSG00000198763 4470 5511 MT 1 MT-ND2
## 3 ENSG00000198804 5904 7445 MT 1 MT-CO1
## 4 ENSG00000198712 7586 8269 MT 1 MT-CO2
## 5 ENSG00000228253 8366 8572 MT 1 MT-ATP8
## 6 ENSG00000198899 8527 9207 MT 1 MT-ATP6
## Gene.length
## 1 956
## 2 1042
## 3 1542
## 4 684
## 5 207
## 6 681
# plot overall distribution
ggplot(data=hg3, aes(x=Gene.length)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# add log scale
ggplot(data=hg3, aes(x=Gene.length)) + geom_histogram() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# plot length distribution by chromosome
ggplot(data=hg3, aes(x=Gene.length)) + geom_histogram() + scale_x_log10() + facet_wrap(~Chromosome)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# boxplot
ggplot(data=hg3, aes(x=Chromosome, y=Gene.length)) + geom_boxplot() + scale_y_log10()
# reorder by median, add color, customize labels
ggplot(data=hg3, aes(x=reorder(Chromosome, Gene.length, FUN=median), y=Gene.length)) + geom_boxplot(aes(fill=Chromosome)) + scale_y_log10() + xlab("Chromosome") + ylab("Gene length (bp)")