This exercise explore the distribution of gene lengths in the human genome

First, we load the libraries:

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Second, we set working direction & read the dataset:

setwd("../bio-714-spring/")
hg <- read.table("hg.tsv2", sep="\t", header=T)
glimpse(hg)
## Observations: 7,828
## Variables: 7
## $ Gene.ID     <fct> ENSG00000198888, ENSG00000198763, ENSG00000198804,...
## $ Gene.Start  <int> 3307, 4470, 5904, 7586, 8366, 8527, 9207, 10059, 1...
## $ Gene.End    <int> 4262, 5511, 7445, 8269, 8572, 9207, 9990, 10404, 1...
## $ Description <fct> mitochondrially encoded NADH:ubiquinone oxidoreduc...
## $ Chromosome  <fct> MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, MT...
## $ Strand      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, ...
## $ Gene.Name   <fct> MT-ND1, MT-ND2, MT-CO1, MT-CO2, MT-ATP8, MT-ATP6, ...
dim(hg)
## [1] 7828    7
head(hg)
##           Gene.ID Gene.Start Gene.End
## 1 ENSG00000198888       3307     4262
## 2 ENSG00000198763       4470     5511
## 3 ENSG00000198804       5904     7445
## 4 ENSG00000198712       7586     8269
## 5 ENSG00000228253       8366     8572
## 6 ENSG00000198899       8527     9207
##                                                                                                Description
## 1 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 1 [Source:HGNC Symbol;Acc:HGNC:7455]
## 2 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 2 [Source:HGNC Symbol;Acc:HGNC:7456]
## 3                        mitochondrially encoded cytochrome c oxidase I [Source:HGNC Symbol;Acc:HGNC:7419]
## 4                       mitochondrially encoded cytochrome c oxidase II [Source:HGNC Symbol;Acc:HGNC:7421]
## 5                                mitochondrially encoded ATP synthase 8 [Source:HGNC Symbol;Acc:HGNC:7415]
## 6                                mitochondrially encoded ATP synthase 6 [Source:HGNC Symbol;Acc:HGNC:7414]
##   Chromosome Strand Gene.Name
## 1         MT      1    MT-ND1
## 2         MT      1    MT-ND2
## 3         MT      1    MT-CO1
## 4         MT      1    MT-CO2
## 5         MT      1   MT-ATP8
## 6         MT      1   MT-ATP6
tail(hg)
##              Gene.ID Gene.Start Gene.End
## 7823 ENSG00000100246   38778508 38794198
## 7824 ENSG00000100034   21919420 21952837
## 7825 ENSG00000105248    4247079  4269090
## 7826 ENSG00000128973   68206992 68257211
## 7827 ENSG00000151655    7703269  7749520
## 7828 ENSG00000141338   68867292 68955392
##                                                                                        Description
## 7823                            dynein, axonemal, light chain 4 [Source:HGNC Symbol;Acc:HGNC:2955]
## 7824               protein phosphatase, Mg2+/Mn2+ dependent 1F [Source:HGNC Symbol;Acc:HGNC:19388]
## 7825                          coiled-coil domain containing 94 [Source:HGNC Symbol;Acc:HGNC:25518]
## 7826 ceroid-lipofuscinosis, neuronal 6, late infantile, variant [Source:HGNC Symbol;Acc:HGNC:2077]
## 7827                inter-alpha-trypsin inhibitor heavy chain 2 [Source:HGNC Symbol;Acc:HGNC:6167]
## 7828                    ATP binding cassette subfamily A member 8 [Source:HGNC Symbol;Acc:HGNC:38]
##      Chromosome Strand Gene.Name
## 7823         22     -1     DNAL4
## 7824         22     -1     PPM1F
## 7825         19      1    CCDC94
## 7826         15     -1      CLN6
## 7827         10      1     ITIH2
## 7828         17     -1     ABCA8
class(hg)
## [1] "data.frame"

Third, we plot/visualize the data

# add a column
hg2 <- mutate(hg, Gene.length = Gene.End - Gene.Start + 1)
head(hg2)
##           Gene.ID Gene.Start Gene.End
## 1 ENSG00000198888       3307     4262
## 2 ENSG00000198763       4470     5511
## 3 ENSG00000198804       5904     7445
## 4 ENSG00000198712       7586     8269
## 5 ENSG00000228253       8366     8572
## 6 ENSG00000198899       8527     9207
##                                                                                                Description
## 1 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 1 [Source:HGNC Symbol;Acc:HGNC:7455]
## 2 mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 2 [Source:HGNC Symbol;Acc:HGNC:7456]
## 3                        mitochondrially encoded cytochrome c oxidase I [Source:HGNC Symbol;Acc:HGNC:7419]
## 4                       mitochondrially encoded cytochrome c oxidase II [Source:HGNC Symbol;Acc:HGNC:7421]
## 5                                mitochondrially encoded ATP synthase 8 [Source:HGNC Symbol;Acc:HGNC:7415]
## 6                                mitochondrially encoded ATP synthase 6 [Source:HGNC Symbol;Acc:HGNC:7414]
##   Chromosome Strand Gene.Name Gene.length
## 1         MT      1    MT-ND1         956
## 2         MT      1    MT-ND2        1042
## 3         MT      1    MT-CO1        1542
## 4         MT      1    MT-CO2         684
## 5         MT      1   MT-ATP8         207
## 6         MT      1   MT-ATP6         681
# filter genes on mitochondria:
hg.mt <- filter(hg, Chromosome == 'MT')
dim(hg.mt)
## [1] 13  7
# filter long genes:
hg.long <- filter(hg2, Gene.length > 1e6)
hg.long
##            Gene.ID Gene.Start  Gene.End
## 1  ENSG00000175497  114442299 115845752
## 2  ENSG00000164796  112222928 113437099
## 3  ENSG00000196090   42072752  43189970
## 4  ENSG00000185565  115802363 117139389
## 5  ENSG00000172264   13995369  16053197
## 6  ENSG00000149972   99020953 100358885
## 7  ENSG00000185046   98726457  99984654
## 8  ENSG00000183117    2935353   4994972
## 9  ENSG00000155093  157539056 158587788
## 10 ENSG00000186094   48532855  50023913
## 11 ENSG00000153707    8314246  10612723
## 12 ENSG00000169946  104590733 105804532
## 13 ENSG00000174469  146116002 148420998
## 14 ENSG00000113448   58969038  60522120
## 15 ENSG00000189283   59749310  61251459
## 16 ENSG00000152208   92303622  93774556
## 17 ENSG00000158321   69598919  70793068
## 18 ENSG00000144451  213284379 214410501
## 19 ENSG00000189108  104566315 105767829
## 20 ENSG00000148948   40114203  41459773
## 21 ENSG00000183098   93226842  94407401
##                                                                                        Description
## 1                           dipeptidyl-peptidase 10 (inactive) [Source:HGNC Symbol;Acc:HGNC:20823]
## 2                             CUB and Sushi multiple domains 3 [Source:HGNC Symbol;Acc:HGNC:19291]
## 3                protein tyrosine phosphatase, receptor type, T [Source:HGNC Symbol;Acc:HGNC:9682]
## 4                     limbic system-associated membrane protein [Source:HGNC Symbol;Acc:HGNC:6705]
## 5                                    MACRO domain containing 2 [Source:HGNC Symbol;Acc:HGNC:16126]
## 6                                                   contactin 5 [Source:HGNC Symbol;Acc:HGNC:2175]
## 7  ankyrin repeat and sterile alpha motif domain containing 1B [Source:HGNC Symbol;Acc:HGNC:24600]
## 8                             CUB and Sushi multiple domains 1 [Source:HGNC Symbol;Acc:HGNC:14026]
## 9  protein tyrosine phosphatase, receptor type, N polypeptide 2 [Source:HGNC Symbol;Acc:HGNC:9677]
## 10                              ATP/GTP binding protein-like 4 [Source:HGNC Symbol;Acc:HGNC:25892]
## 11               protein tyrosine phosphatase, receptor type, D [Source:HGNC Symbol;Acc:HGNC:9668]
## 12                    zinc finger protein, FOG family member 2 [Source:HGNC Symbol;Acc:HGNC:16700]
## 13                         contactin associated protein-like 2 [Source:HGNC Symbol;Acc:HGNC:13830]
## 14                                         phosphodiesterase 4D [Source:HGNC Symbol;Acc:HGNC:8783]
## 15                                      fragile histidine triad [Source:HGNC Symbol;Acc:HGNC:3701]
## 16                      glutamate receptor, ionotropic, delta 2 [Source:HGNC Symbol;Acc:HGNC:4576]
## 17                           autism susceptibility candidate 2 [Source:HGNC Symbol;Acc:HGNC:14262]
## 18                                 sperm associated antigen 16 [Source:HGNC Symbol;Acc:HGNC:23225]
## 19              interleukin 1 receptor accessory protein-like 2 [Source:HGNC Symbol;Acc:HGNC:5997]
## 20                           leucine rich repeat containing 4C [Source:HGNC Symbol;Acc:HGNC:29317]
## 21                                                   glypican 6 [Source:HGNC Symbol;Acc:HGNC:4454]
##    Chromosome Strand Gene.Name Gene.length
## 1           2      1     DPP10     1403454
## 2           8     -1     CSMD3     1214172
## 3          20     -1     PTPRT     1117219
## 4           3     -1     LSAMP     1337027
## 5          20      1   MACROD2     2057829
## 6          11      1     CNTN5     1337933
## 7          12     -1    ANKS1B     1258198
## 8           8     -1     CSMD1     2059620
## 9           7     -1    PTPRN2     1048733
## 10          1     -1     AGBL4     1491059
## 11          9     -1     PTPRD     2298478
## 12          8      1     ZFPM2     1213800
## 13          7      1   CNTNAP2     2304997
## 14          5     -1     PDE4D     1553083
## 15          3     -1      FHIT     1502150
## 16          4      1     GRID2     1470935
## 17          7      1     AUTS2     1194150
## 18          2      1    SPAG16     1126123
## 19          X      1  IL1RAPL2     1201515
## 20         11     -1    LRRC4C     1345571
## 21         13      1      GPC6     1180560
# select columns
hg3 <- select(hg2, -4)
head(hg3)
##           Gene.ID Gene.Start Gene.End Chromosome Strand Gene.Name
## 1 ENSG00000198888       3307     4262         MT      1    MT-ND1
## 2 ENSG00000198763       4470     5511         MT      1    MT-ND2
## 3 ENSG00000198804       5904     7445         MT      1    MT-CO1
## 4 ENSG00000198712       7586     8269         MT      1    MT-CO2
## 5 ENSG00000228253       8366     8572         MT      1   MT-ATP8
## 6 ENSG00000198899       8527     9207         MT      1   MT-ATP6
##   Gene.length
## 1         956
## 2        1042
## 3        1542
## 4         684
## 5         207
## 6         681
# plot overall distribution
ggplot(data=hg3, aes(x=Gene.length)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# add log scale
ggplot(data=hg3, aes(x=Gene.length)) + geom_histogram() + scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# plot length distribution by chromosome
ggplot(data=hg3, aes(x=Gene.length)) + geom_histogram() + scale_x_log10() + facet_wrap(~Chromosome)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# boxplot
ggplot(data=hg3, aes(x=Chromosome, y=Gene.length)) + geom_boxplot() + scale_y_log10()

# reorder by median, add color, customize labels
ggplot(data=hg3, aes(x=reorder(Chromosome, Gene.length, FUN=median), y=Gene.length)) + geom_boxplot(aes(fill=Chromosome)) + scale_y_log10() + xlab("Chromosome") + ylab("Gene length (bp)")