This short tutorial will allow you to explore dplyr
functionality based on the previous lecture. Every question can be answered with a combination of %>%
pipes. You should refrain from using temporary variables and statements outside of the range of the tidyverse.
If you are missing `biomaRt, install with
source("https://bioconductor.org/biocLite.R")
biocLite("biomaRt")
library(biomaRt)
## Loading required package: methods
gene_mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")
gene_set <- useDataset(gene_mart , dataset="hsapiens_gene_ensembl")
gene_by_exon <-as_tibble(getBM(
mart = gene_set,
attributes = c(
"ensembl_gene_id",
"ensembl_transcript_id",
"ensembl_exon_id",
"chromosome_name",
"start_position",
"end_position",
"hgnc_symbol",
"hgnc_id",
"strand",
"gene_biotype",
"phenotype_description"
),
filter = "chromosome_name",
value = "21"
))
genes_by_exon
data set. Convert genes_by_exon
data set to a tibble
Use glimpse()
to find the correct columns and distinct()
to identify how pseudogenes are coded. Store the results in a tibble pseudogenes
gene_by_exon <- gene_by_exon %>%
as_tibble()
pseudogenes <- gene_by_exon %>%
filter(gene_biotype == "processed_pseudogene")
table()
obviously )pseudogenes %>% count()
## # A tibble: 1 × 1
## n
## <int>
## 1 170
genes
gene_by_exon %>%
dplyr::select(-ensembl_transcript_id, -ensembl_exon_id) %>%
distinct()-> genes
gene_by_exon %>%
mutate(length=end_position-start_position) %>%
arrange(length)
## # A tibble: 19,330 × 12
## ensembl_gene_id ensembl_transcript_id ensembl_exon_id chromosome_name
## <chr> <chr> <chr> <int>
## 1 ENSG00000266692 ENST00000581669 ENSE00002724833 21
## 2 ENSG00000275523 ENST00000611656 ENSE00003745719 21
## 3 ENSG00000264063 ENST00000577708 ENSE00002724720 21
## 4 ENSG00000277437 ENST00000614492 ENSE00003754026 21
## 5 ENSG00000275167 ENST00000611994 ENSE00003717901 21
## 6 ENSG00000283904 ENST00000385060 ENSE00001500066 21
## 7 ENSG00000284448 ENST00000290239 ENSE00003729123 21
## 8 ENSG00000275166 ENST00000622292 ENSE00003738277 21
## 9 ENSG00000201025 ENST00000364155 ENSE00001438918 21
## 10 ENSG00000263681 ENST00000582241 ENSE00002698395 21
## # ... with 19,320 more rows, and 8 more variables: start_position <int>,
## # end_position <int>, hgnc_symbol <chr>, hgnc_id <chr>, strand <int>,
## # gene_biotype <chr>, phenotype_description <chr>, length <int>
gene_biotype
.gene_by_exon %>%
mutate(length=end_position-start_position) %>%
group_by(gene_biotype) %>%
summarize(mean_lenght = floor(mean(length)))
## # A tibble: 19 × 2
## gene_biotype mean_lenght
## <chr> <dbl>
## 1 3prime_overlapping_ncRNA 3643
## 2 antisense 34382
## 3 IG_V_gene 435
## 4 lincRNA 105951
## 5 miRNA 86
## 6 misc_RNA 145
## 7 processed_pseudogene 1006
## 8 processed_transcript 72905
## 9 protein_coding 139095
## 10 pseudogene 103
## 11 rRNA 132
## 12 sense_intronic 12055
## 13 sense_overlapping 40989
## 14 snoRNA 118
## 15 snRNA 116
## 16 TEC 2413
## 17 transcribed_processed_pseudogene 97124
## 18 transcribed_unprocessed_pseudogene 84268
## 19 unprocessed_pseudogene 12495
gene_biotype
.genes %>%
mutate(length=end_position-start_position) %>%
group_by(gene_biotype) %>%
summarize(mean_length = floor(mean(length)),
n_gene = n_distinct(ensembl_gene_id))
## # A tibble: 19 × 3
## gene_biotype mean_length n_gene
## <chr> <dbl> <int>
## 1 3prime_overlapping_ncRNA 3643 1
## 2 antisense 16677 81
## 3 IG_V_gene 435 1
## 4 lincRNA 27550 192
## 5 miRNA 86 29
## 6 misc_RNA 145 24
## 7 processed_pseudogene 818 141
## 8 processed_transcript 83106 7
## 9 protein_coding 82332 233
## 10 pseudogene 103 1
## 11 rRNA 132 9
## 12 sense_intronic 6817 17
## 13 sense_overlapping 27095 8
## 14 snoRNA 118 15
## 15 snRNA 116 21
## 16 TEC 1938 16
## 17 transcribed_processed_pseudogene 49814 3
## 18 transcribed_unprocessed_pseudogene 65976 6
## 19 unprocessed_pseudogene 6128 32
separate
, gather
the columns and count
in a single dplyr
statement.gene_by_exon %>%
dplyr::select(phenotype_description) %>%
distinct() %>%
filter(!is.na(phenotype_description)) %>%
separate(phenotype_description , LETTERS[1:15], extra="drop", fill="right") %>%
gather(everything(), key="position", value="word") %>%
mutate(word=tolower(word)) %>%
count(word) %>%
arrange(desc(n))
## # A tibble: 207 × 2
## word n
## <chr> <int>
## 1 <NA> 1014
## 2 syndrome 22
## 3 type 16
## 4 deficiency 13
## 5 to 12
## 6 autosomal 11
## 7 early 9
## 8 cataract 8
## 9 1 7
## 10 disease 7
## # ... with 197 more rows