GenomicRanges and Related Packages Demo

Introduction

This document demonstrates the core concepts of working with genomic intervals in Bioconductor. It covers foundational integer ranges (IRanges), base-level run-length encoding (Rle), full genomic ranges with chromosome and strand information (GenomicRanges), and retrieving transcript annotations (GenomicFeatures).

Introduction to IRanges

The IRanges package provides the foundation for representing sequences of integers. This chunk shows how to create basic interval objects using start/end coordinates or start/width combinations, and how to access their properties.

#####################################################
## first on IRanges
#####################################################
library(IRanges)

# Create an IRanges object by specifying start and end points
r <- IRanges(start=c(1,3,12, 10), end=c(4, 5, 25, 19))
r

## IRanges object with 4 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         1         4         4
##   [2]         3         5         3
##   [3]        12        25        14
##   [4]        10        19        10

# Alternatively, create the exact same IRanges object using start and width
r <- IRanges(start=c(1,3,12, 10), width=c(4,  3, 14, 10))
r

## IRanges object with 4 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         1         4         4
##   [2]         3         5         3
##   [3]        12        25        14
##   [4]        10        19        10

# Accessor functions to extract properties of the IRanges object
length(r) # Number of intervals in the object

## [1] 4

start(r)  # Vector of start positions

## [1]  1  3 12 10

end(r)    # Vector of end positions

## [1]  4  5 25 19

width(r)  # Vector of interval widths

## [1]  4  3 14 10

# Subset the IRanges object to keep only the first two intervals
r[1:2]

## IRanges object with 2 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         1         4         4
##   [2]         3         5         3

Visualizing Ranges

Because intervals can be abstract, this custom plotting function helps visualize them as staggered rectangles on a plot. We will use this to visualize the operations in the next chunk.

# A small visualization function to plot intervals
plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), col = "black", sep = 0.5, ...) {
  height <- 1
  # Determine the x-axis limits based on the min/max of the ranges
  if (is(xlim, "Ranges"))
    xlim <- c(min(start(xlim)), max(end(xlim)))
  
  # Calculate staggered bins so overlapping ranges don't plot on top of each other
  bins <- disjointBins(IRanges(start(x), end(x)+1))
  
  # Initialize the plot canvas
  plot.new()
  plot.window(xlim, c(0, max(bins)*(height+sep)))
  
  # Calculate y-coordinates for drawing the rectangles
  ybottom <- bins * (sep + height) - height
  
  # Draw the ranges as rectangles
  rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom+ height, col = col, ...)
  
  # Add title and x-axis
  title(main)
  axis(1)
}

Single IRanges Operations

This chunk demonstrates intra-range operations (transforming existing ranges) and inter-range operations (comparing ranges within a single object to each other), plotting the results.

# Set up plotting parameters for stacked plots
par(mar = c(2.5, 2.5, 2.6, 1.1), mgp = c(5.5, 1.5, 0), mfrow=c(2,1))

# Compare original ranges vs. reduced ranges
plotRanges(r, xlim=c(0,25), col="grey")
# reduce() merges all overlapping or adjacent ranges into a single continuous range
plotRanges(reduce(r), xlim=c(0,25), col="grey")

par(mar = c(2.5, 2.5, 2.6, 1.1), mgp = c(5.5, 1.5, 0), mfrow=c(2,1))
plotRanges(r, xlim=c(0,25), col="grey")
# disjoin() breaks ranges into the largest set of non-overlapping contiguous intervals
plotRanges(disjoin(r), xlim=c(0,25), col="grey")

par(mar = c(2.5, 2.5, 2.6, 1.1), mgp = c(5.5, 1.5, 0), mfrow=c(3,1))
plotRanges(r, xlim=c(-2,27), col="grey")
# flank() creates new ranges of width 1 adjacent to the starts of the original ranges (upstream/downstream)
plotRanges(flank(r, 1, both=TRUE, start=TRUE), xlim=c(-2,27), col="grey")
plotRanges(flank(r, 1, both=TRUE, start=FALSE), xlim=c(-2,27), col="grey")

par(mar = c(2.5, 2.5, 2.6, 1.1), mgp = c(5.5, 1.5, 0), mfrow=c(1,1))
plotRanges(r, xlim=c(-2,27), col="grey")
# coverage() calculates exactly how many ranges overlap at every single integer position
lines(coverage(r), lwd=4, col="red")

Operations Between Two IRanges

Here we explore operations between two different sets of ranges, calculating standard set theory operations (union, intersection, difference) and finding overlaps.

##### two ranges
# Create a second set of intervals
r2 = IRanges(start=c(8, 9, 15), end=c(10, 15, 17))
r2

## IRanges object with 3 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         8        10         3
##   [2]         9        15         7
##   [3]        15        17         3

par(mar = c(2.5, 2.5, 2.6, 1.1), mgp = c(5.5, 1.5, 0), mfrow=c(5,1))
plotRanges(r, xlim=c(-2,27), col="grey")
plotRanges(r2, xlim=c(-2,27), col="grey")
# union: intervals that are in r OR r2
plotRanges(union(r, r2), xlim=c(-2,27), col="grey")
# intersect: intervals that are in BOTH r AND r2
plotRanges(intersect(r, r2), xlim=c(-2,27), col="grey")
# setdiff: intervals in r that do NOT overlap with r2
plotRanges(setdiff(r, r2), xlim=c(-2,27), col="grey")

par(mar = c(2.5, 2.5, 2.6, 1.1), mgp = c(5.5, 1.5, 0), mfrow=c(2,1))
plotRanges(r, xlim=c(-2,27), col="grey")
plotRanges(r2, xlim=c(-2,27), col="grey")

# Count how many intervals in r2 overlap with each interval in r
countOverlaps(r, r2)

## [1] 0 0 2 3

# Logical vector: TRUE if an interval in r overlaps any interval in r2
r %over% r2

## [1] FALSE FALSE  TRUE  TRUE

# Detailed matrix/object showing exact indices of which ranges in r overlap with which ranges in r2
findOverlaps(r, r2)

## Hits object with 5 hits and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         3           2
##   [2]         3           3
##   [3]         4           1
##   [4]         4           2
##   [5]         4           3
##   -------
##   queryLength: 4 / subjectLength: 3

Run Length Encoding (Rle)

Rle is a data compression method frequently used in genomics (e.g., for coverage vectors) where consecutive identical values are stored as a single value and a run length.

###### show Rle
# Create Rle directly from a vector
x <- Rle(c(1,1,2,2,2))
x

## numeric-Rle of length 5 with 2 runs
##   Lengths: 2 3
##   Values : 1 2

# Create Rle by specifying the values and how long they run
x <- Rle(values=c(1,2), lengths=c(2,3))
x

## numeric-Rle of length 5 with 2 runs
##   Lengths: 2 3
##   Values : 1 2

# Rle works with characters too
x <- Rle(values=c("a","b","c"), lengths=c(2,3,4))
x

## character-Rle of length 9 with 3 runs
##   Lengths:   2   3   4
##   Values : "a" "b" "c"

# Decompress back to a standard character vector
as.character(x)

## [1] "a" "a" "b" "b" "b" "c" "c" "c" "c"

x <- Rle(c(1,1,2,2,2))
# Accessor functions for Rle objects
length(x)     # Total decompressed length

## [1] 5

start(x)      # Start indices of each run

## [1] 1 3

end(x)        # End indices of each run

## [1] 2 5

width(x)      # Length of each run

## [1] 2 3

nrun(x)       # Total number of runs (2 in this case)

## [1] 2

runLength(x)  # Vector containing the length of each run

## [1] 2 3

Intro to GenomicRanges

GenomicRanges (GRanges) extends IRanges by adding biological context: chromosome names (seqnames) and strands (+/-). It also allows metadata columns (like scores or GC content).

#####################################################
## GenomicRanges
#####################################################
library(GenomicRanges)

# Create a GRanges object combining chromosomes, intervals, strands, and metadata
gr <- GRanges(
  seqnames = Rle(c("chr1", "chr2"), c(2, 3)),
  ranges = IRanges(start=1:5, end = 6:10),
  strand = Rle(strand(c("-", "+", "+","-")), c(1,1,2,1)),
  score = 1:5, GC = seq(1, 0, length = 5) # Metadata columns
)
gr

## GRanges object with 5 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1       1-6      - |         1      1.00
##   [2]     chr1       2-7      + |         2      0.75
##   [3]     chr2       3-8      + |         3      0.50
##   [4]     chr2       4-9      + |         4      0.25
##   [5]     chr2      5-10      - |         5      0.00
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

# Visualize the GRanges object using ggbio
plotRanges(gr)

## operate GRanges object
# Standard accessors now include genomic context
length(gr)

## [1] 5

seqnames(gr)          # Chromosome information

## factor-Rle of length 5 with 2 runs
##   Lengths:    2    3
##   Values : chr1 chr2
## Levels(2): chr1 chr2

ranges(gr)            # The underlying IRanges coordinates

## IRanges object with 5 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         1         6         6
##   [2]         2         7         6
##   [3]         3         8         6
##   [4]         4         9         6
##   [5]         5        10         6

strand(gr)            # Strand information

## factor-Rle of length 5 with 3 runs
##   Lengths: 1 3 1
##   Values : - + -
## Levels(3): + - *

elementMetadata(gr)   # Extract the attached metadata (score, GC)

## DataFrame with 5 rows and 2 columns
##       score        GC
##   <integer> <numeric>
## 1         1      1.00
## 2         2      0.75
## 3         3      0.50
## 4         4      0.25
## 5         5      0.00

## subsetting and combining
gr[1:2]          # Extract first two ranges

## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1       1-6      - |         1      1.00
##   [2]     chr1       2-7      + |         2      0.75
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

c(gr[1], gr[3])  # Concatenate the 1st and 3rd ranges

## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1       1-6      - |         1       1.0
##   [2]     chr2       3-8      + |         3       0.5
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

GRanges Operations and Visualization

GenomicRanges supports the same range operations as IRanges, but it automatically respects boundaries like chromosomes and strands (e.g., intervals on chr1 will never merge with intervals on chr2).

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union

## The following object is masked from 'package:Seqinfo':
## 
##     intersect

## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union

## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, setequal, union

## The following object is masked from 'package:generics':
## 
##     explain

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## other useful functions
# Coverage calculation across chromosomes
coverage(gr)

## RleList of length 2
## $chr1
## integer-Rle of length 7 with 3 runs
##   Lengths: 1 5 1
##   Values : 1 2 1
## 
## $chr2
## integer-Rle of length 10 with 6 runs
##   Lengths: 2 1 1 4 1 1
##   Values : 0 1 2 3 2 1

# Reduce, disjoin, flank, and gaps. Notice the use of the %>% pipe for plotting
reduce(gr) %>% plotRanges()

disjoin(gr) %>% plotRanges()

flank(gr, width = 3)  %>% plotRanges()

# gaps() finds the empty spaces between intervals on the same chromosome/strand
gaps(gr) %>% plotRanges()

GRanges Set Operations and Overlaps

Demonstrating overlaps and set theory across two GRanges objects. Operations only evaluate against matching chromosomes and strands.

## set operations
gr1 <- GRanges(
  seqnames = Rle("chr1", 2),
  ranges=IRanges(start=c(1,10), end = c(5,15))
)
gr2 <- GRanges(
  seqnames = Rle("chr1", 1),
  ranges = IRanges(start=3, end = 12)
)

# Plot combined ranges
plotRanges(c(gr1, gr2))

# Perform and plot set operations
union(gr1, gr2)  %>% plotRanges()

intersect(gr1, gr2) %>% plotRanges()

setdiff(gr1, gr2) %>% plotRanges()

## find overlaps
# Find detailed hits between gr1 and gr2
findOverlaps(gr1, gr2)

## Hits object with 2 hits and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         1           1
##   [2]         2           1
##   -------
##   queryLength: 2 / subjectLength: 1

# Logical check for overlaps
gr1 %over% gr2

## [1] TRUE TRUE

GRangesList

A GRangesList is a container for holding multiple GRanges objects, often used to group transcripts by gene or exons by transcript.

##### GRangesList
# Combine multiple GRanges into a single list object
glist = GRangesList(gr1, gr2)
glist

## GRangesList object of length 2:
## [[1]]
## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1       1-5      *
##   [2]     chr1     10-15      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
## 
## [[2]]
## GRanges object with 1 range and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1      3-12      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

glist[[1]] # Access the first GRanges object in the list

## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1       1-5      *
##   [2]     chr1     10-15      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

glist[[2]] # Access the second GRanges object in the list

## GRanges object with 1 range and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1      3-12      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

# Apply functions over the list elements
lapply(glist, length) # Returns a standard list of lengths

## [[1]]
## [1] 2
## 
## [[2]]
## [1] 1

sapply(glist, length) # Returns a simplified vector of lengths

## [1] 2 1

GenomicFeatures and Transcript Databases (TxDb)

GenomicFeatures allows you to interact with transcript databases (like those from UCSC). Note: The saveDb/loadDb operations use a local file path which you will need to adjust for your system.

#####################################################
## GenomicFeatures
#####################################################
library(GenomicFeatures)
library(txdbmaker)

## from UCSC
# Check which UCSC tracks are supported for generating databases
supportedUCSCtables()
# Build a Transcript Database (TxDb) straight from the UCSC knownGene table for hg19
txdb = makeTxDbFromUCSC(genome="hg19", tablename="knownGene")
txdb

## save and load
# NOTE: Update 'dataDir' to a valid directory on your machine before running
dataDir <- "/Users/mdozmorov/Documents/Work/Teaching/BIOS668.2018/assets/data"
# Save the database locally as an SQLite file to avoid downloading it again later
saveDb(txdb, file=paste0(dataDir, "/hg19_knownGenes.sqlite")) # Takes a few minutes
# Load the saved database
txdb = loadDb(paste0(dataDir, "/hg19_knownGenes.sqlite"))
txdb

## make TxDb package - this gives a package directory in current folder
# (Commented out) This generates an actual installable R package containing your TxDb
# makeTxDbPackageFromUCSC(maintainer="Hao Wu <hao.wu@emory.edu>", author="Hao Wu",
#                         version="1.0", genome="hg19",tablename="knownGene")

## retrieve basic features
# Extract all transcripts as a GRanges object
transcripts(txdb)
# Extract transcripts, filtering only for chromosome 1
transcripts(txdb, filter=list(tx_chrom="chr1"))
# Extract all exons
exons(txdb)

# Group features into GRangesLists (e.g., all transcripts grouped by their parent gene)
transcriptsBy(txdb, by="gene")
exonsBy(txdb, by="gene")
intronsByTranscript(txdb)
fiveUTRsByTranscript(txdb)
threeUTRsByTranscript(txdb)

## by overlap
# Define a region of interest
gr = GRanges(
  seqnames = Rle("chr1", 2),
  ranges=IRanges(start=c(100000,500000), end = c(200000,600000))
)
gr
# Retrieve only the transcripts that overlap with our region of interest
transcriptsByOverlaps(txdb, gr)