Bioconductor Annotation Resources

Introduction

In this demonstration, we will explore several powerful Bioconductor packages used for genomic annotation: * OrgDb: Gene-centric annotations for a specific organism. * TxDb: Transcript-centric annotations (genomic coordinates). * biomaRt: R interface to query Ensembl and other BioMart databases. * AnnotationHub & ExperimentHub: Cloud-based repositories for standard annotations and curated experimental datasets.

Setup and Libraries

First, we load the necessary packages. If you don’t have these installed, you will need to use BiocManager::install().

library(org.Hs.eg.db)                         # Human gene annotations
library(TxDb.Hsapiens.UCSC.hg38.knownGene)    # Human transcript models (hg38)
library(biomaRt)                              # Web-based database queries
# library(KEGGREST)                             # API for KEGG pathways
library(AnnotationHub)                        # Hub for annotation objects
library(GenomicRanges)                        # Core structure for genomic intervals
library(ExperimentHub)                        # Hub for experimental datasets
library(SummarizedExperiment)                 # Core container for assay data
library(dplyr)                                # Data manipulation

OrgDb: Organism-Level Annotations

OrgDb packages provide gene-centric annotations (like mapping Ensembl IDs to Gene Symbols) for specific organisms. The org.Hs.eg.db package is specifically for Homo sapiens.

# View package documentation
# ?org.Hs.eg.db

# See what types of data (columns) we can extract from this database
columns(org.Hs.eg.db)

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "MAP"         
## [16] "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"        
## [21] "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [26] "UNIPROT"

# Map between different identifier types
# Here, we map HGNC Gene Symbols (BRCA1, BRCA2) to their corresponding Ensembl IDs.
mapIds(
  x = org.Hs.eg.db, 
  keys = c("BRCA1", "BRCA2"), # The IDs we are querying
  column = "ENSEMBL",         # The type of ID we want back
  keytype = "SYMBOL",         # The type of ID we are passing in
  multiVals = "first"         # What to do if there are multiple matches
)

##             BRCA1             BRCA2 
## "ENSG00000012048" "ENSG00000139618"

TxDb: Transcript-Level Annotations

TxDb packages store genomic coordinates for transcripts, exons, and CDS (coding sequences). Here we use the UCSC hg38 genome build.

# Extract all exons from the database as a GRanges object
all_exons <- exons(TxDb.Hsapiens.UCSC.hg38.knownGene)
head(all_exons, 3)

## GRanges object with 3 ranges and 1 metadata column:
##       seqnames      ranges strand |   exon_id
##          <Rle>   <IRanges>  <Rle> | <integer>
##   [1]     chr1 11121-11211      + |         1
##   [2]     chr1 11125-11211      + |         2
##   [3]     chr1 11410-11671      + |         3
##   -------
##   seqinfo: 711 sequences (1 circular) from hg38 genome

# Group exons by their corresponding gene
# This returns a GRangesList where each list element is a gene, 
# and the GRanges inside are the exons for that gene.
exons_by_gene <- exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene, by = "gene")
head(exons_by_gene, 2)

## GRangesList object of length 2:
## $`1`
## GRanges object with 29 ranges and 2 metadata columns:
##        seqnames            ranges strand |   exon_id   exon_name
##           <Rle>         <IRanges>  <Rle> | <integer> <character>
##    [1]    chr19 58345178-58347029      - |    812668        <NA>
##    [2]    chr19 58345183-58347029      - |    812669        <NA>
##    [3]    chr19 58346854-58347029      - |    812670        <NA>
##    [4]    chr19 58346858-58347029      - |    812671        <NA>
##    [5]    chr19 58346860-58347029      - |    812672        <NA>
##    ...      ...               ...    ... .       ...         ...
##   [25]    chr19 58357585-58357649      - |    812695        <NA>
##   [26]    chr19 58357952-58358286      - |    812696        <NA>
##   [27]    chr19 58358489-58358585      - |    812697        <NA>
##   [28]    chr19 58359197-58359323      - |    812698        <NA>
##   [29]    chr19 58362677-58362751      - |    812699        <NA>
##   -------
##   seqinfo: 711 sequences (1 circular) from hg38 genome
## 
## $`10`
## GRanges object with 12 ranges and 2 metadata columns:
##        seqnames            ranges strand |   exon_id   exon_name
##           <Rle>         <IRanges>  <Rle> | <integer> <character>
##    [1]     chr8 18386273-18386390      + |    381744        <NA>
##    [2]     chr8 18386315-18386390      + |    381745        <NA>
##    [3]     chr8 18386348-18386563      + |    381746        <NA>
##    [4]     chr8 18386783-18387036      + |    381749        <NA>
##    [5]     chr8 18386827-18386910      + |    381750        <NA>
##    ...      ...               ...    ... .       ...         ...
##    [8]     chr8 18388065-18388323      + |    381754        <NA>
##    [9]     chr8 18391282-18391345      + |    381756        <NA>
##   [10]     chr8 18391287-18391345      + |    381757        <NA>
##   [11]     chr8 18399998-18401218      + |    381758        <NA>
##   [12]     chr8 18400337-18400993      + |    381759        <NA>
##   -------
##   seqinfo: 711 sequences (1 circular) from hg38 genome

biomaRt: Querying Web Databases

biomaRt allows us to query major databases (like Ensembl) directly from R. It works using three main concepts: 1. Mart: The database you want to connect to. 2. Filters: The input criteria (e.g., “Find genes on chromosome 21”). 3. Attributes: The information you want retrieved (e.g., “Give me their gene symbols”).

# 1. Discover available marts and datasets
head(listMarts(), 7)                               # List available databases
head(listDatasets(useMart("ensembl")), 7)          # List datasets within Ensembl

# 2. Connect to the fully specified mart (Human Ensembl)
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

Now let’s perform some queries using our ensembl connection.

# Query 1: Find Gene symbols associated with a specific GO term (MAP kinase activity)
# Note: GO:0004707 is MAP kinase activity; GO:0004704 is a related parent term.
go_query <- getBM(
  attributes = c('entrezgene_id', 'hgnc_symbol'), # Note: 'entrezgene' is deprecated in newer versions
  filters = 'go', 
  values = 'GO:0004707',
  mart = ensembl
)
head(go_query, 5)

# ---------------------------------------------------------
# Exploring available filters and attributes
# ---------------------------------------------------------

# Look at available filters and search for ones related to "entrez"
head(listFilters(ensembl), 7)
all_filters <- listFilters(ensembl)
all_filters[grep("entrez", all_filters[, "description"], ignore.case = TRUE), ]

# Look at the specific options for a particular filter (e.g., chromosome_name)
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) # Truncated for easy viewing

# Look at available attributes (what we can ask the database to return)
head(listAttributes(ensembl), 3)

# ---------------------------------------------------------
# Assemble and execute a custom query
# ---------------------------------------------------------
myAttributes <- c("ensembl_gene_id", "chromosome_name")
myValues <- c("21", "22") # Look on chromosomes 21 and 22

res <- getBM(
  attributes = myAttributes, 
  filters = myFilter,
  values = myValues, 
  mart = ensembl
)

# View a summary of the results
table(res$chromosome_name)

AnnotationHub: Cloud-Based Annotations

AnnotationHub is a massive cloud repository of standard annotation files (BED, GTF, GRanges, etc.). Instead of downloading and parsing flat files, you retrieve them as ready-to-use R objects.

# Initialize the hub
ah <- AnnotationHub()

# See who provides data to the hub
head(unique(ah$dataprovider))

## [1] "UCSC"        "Ensembl"     "RefNet"      "Inparanoid8" "NHLBI"      
## [6] "ChEA"

# Subset the hub to only include Human data to make it manageable
ah <- subset(ah, species == "Homo sapiens")
ah

## AnnotationHub with 26647 records
## # snapshotDate(): 2025-04-08
## # $dataprovider: BroadInstitute, UCSC, Ensembl, GENCODE, Google DeepMind, UW...
## # $species: Homo sapiens
## # $rdataclass: GRanges, BigWigFile, Rle, ChainFile, TwoBitFile, TxDb, list, ...
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## #   rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH5012"]]' 
## 
##              title                                   
##   AH5012   | Chromosome Band                         
##   AH5013   | STS Markers                             
##   AH5014   | FISH Clones                             
##   AH5015   | Recomb Rate                             
##   AH5016   | ENCODE Pilot                            
##   ...        ...                                     
##   AH119513 | T2T.GreyListChIP.STAR_101bp_1000merge   
##   AH119529 | org.Hs.eg.db.sqlite                     
##   AH119538 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite
##   AH121554 | LRBaseDb for Homo sapiens (Human, v009) 
##   AH121659 | MeSHDb for Homo sapiens (Human, v009)

Let’s look for specific data, like UCSC tracks for hg19.

# Query the hub for specific terms
ah_query <- query(ah, c("ucsc", "hg19"))

# Quick look at the metadata fields available
head(ah_query$title, 3)

## [1] "Chromosome Band" "STS Markers"     "FISH Clones"

head(ah_query$tags, 2)

## [1] "cytoBand, UCSC, track, Gene, Transcript, Annotation"
## [2] "stsMap, UCSC, track, Gene, Transcript, Annotation"

# See the most common file types available in our subset
ah_query$description %>% table() %>% sort(decreasing = TRUE) %>% head()

## .
##                               narrowPeak file from ENCODE 
##                                                      2020 
##                                broadPeak file from ENCODE 
##                                                      1976 
##                                      gtf file from ENCODE 
##                                                       892 
##                           bedRnaElements file from ENCODE 
##                                                       393 
##                 phastCons scores for Homo sapiens on chr1 
##                                                         3 
## phastCons scores for Homo sapiens on chr1_gl000191_random 
##                                                         3

Let’s retrieve an actual dataset from the hub. Note: In practice, it’s safer to extract objects using their AH ID (e.g., “AH5012”) rather than their index number, as hub contents can update.

# Extracting a specific item (CpG Islands) using its index from our query
ah_query$title[75]

## [1] "CpG Islands"

cpg_granges <- ah_query[[75]] 
summary(width(cpg_granges))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   201.0   323.0   559.0   761.3   942.0 45712.0

# reduce() merges overlapping genomic ranges
summary(width(reduce(cpg_granges)))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   201.0   323.0   559.0   761.3   942.0 45712.0

# Extracting an item using its exact AH ID (Chromosome Bands)
ah_query$title[ah_query$ah_id == "AH5012"]

## [1] "Chromosome Band"

chromosome_bands <- ah_query[["AH5012"]]
head(chromosome_bands, 3)

## UCSC track 'cytoBand'
## UCSCData object with 3 ranges and 1 metadata column:
##       seqnames          ranges strand |        name
##          <Rle>       <IRanges>  <Rle> | <character>
##   [1]     chr1       1-2300000      * |      p36.33
##   [2]     chr1 2300001-5400000      * |      p36.32
##   [3]     chr1 5400001-7200000      * |      p36.31
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

# View the metadata attached to this object
metadata(chromosome_bands)

## $AnnotationHubName
## [1] "AH5012"
## 
## $`File Name`
## [1] "cytoBand"
## 
## $`Data Source`
## [1] "rtracklayer://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/cytoBand"
## 
## $Provider
## [1] "UCSC"
## 
## $Organism
## [1] "Homo sapiens"
## 
## $`Taxonomy ID`
## [1] 9606

ExperimentHub: Curated Datasets

While AnnotationHub provides reference annotations, ExperimentHub provides curated, analysis-ready datasets from publications and large consortia (like TCGA).

# Initialize the hub
eh <- ExperimentHub()

# Query for datasets related to The Cancer Genome Atlas (TCGA)
tcga_data <- query(eh, "TCGA")

# View the first few matching datasets
head(tcga_data, 5)

## ExperimentHub with 5 records
## # snapshotDate(): 2025-04-12
## # $dataprovider: Eli and Edythe L. Broad Institute of Harvard and MIT, GEO
## # $species: Homo sapiens
## # $rdataclass: RaggedExperiment, SummarizedExperiment, ExpressionSet, DataFrame
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## #   rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["EH1"]]' 
## 
##           title                                                              
##   EH1   | RNA-Sequencing and clinical data for 7706 tumor samples from The...
##   EH558 | ACC_CNASNP-20160128                                                
##   EH559 | ACC_CNVSNP-20160128                                                
##   EH560 | ACC_colData-20160128                                               
##   EH561 | ACC_GISTIC_AllByGene-20160128