Biostrings and Genomic Data Demo

Introduction

This document demonstrates the core functionalities of the Biostrings package and related Bioconductor packages for sequence analysis, genomic range manipulation, and variant annotation.

Basic BString Operations

The Biostrings package provides efficient containers for string manipulation. The BString class is a general-purpose container for large strings.

library(Biostrings)

## Loading required package: BiocGenerics

## Loading required package: generics

## 
## Attaching package: 'generics'

## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: XVector

## Loading required package: Seqinfo

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

# Create a basic Biostring object
a = BString("I am a string!") 
a

## 14-letter BString object
## seq: I am a string!

# Get the number of characters in the string
length(a)

## [1] 14

# Extract the first 4 characters using standard R indexing
a[1:4]

## 4-letter BString object
## seq: I am

# Extract a subsequence using the subseq function
subseq(a, 1, 4)

## 4-letter BString object
## seq: I am

# Extract a subsequence from the start up to the 4th character
subseq(a, end=4)

## 4-letter BString object
## seq: I am

# Reverse the string (note: this simply reverses characters, it is NOT a biological reverse complement)
rev(a)

## 14-letter BString object
## seq: !gnirts a ma I

# Logical comparisons
a == "I am a string!" # Returns TRUE

## [1] TRUE

a[1:4] == "I am"      # Returns TRUE

## [1] TRUE

# Convert the BString back to a standard R character vector
toString(a)

## [1] "I am a string!"

# Check the object classes
class(a)              # "BString"

## [1] "BString"
## attr(,"package")
## [1] "Biostrings"

class(toString(a))    # "character"

## [1] "character"

DNA and RNA Strings

For biological data, it is better to use DNAString or RNAString, which enforce specific biological alphabets and enable specialized operations.

## Inspect built-in biological alphabets and maps
IUPAC_CODE_MAP # Dictionary of IUPAC ambiguity codes

##      A      C      G      T      M      R      W      S      Y      K      V 
##    "A"    "C"    "G"    "T"   "AC"   "AG"   "AT"   "CG"   "CT"   "GT"  "ACG" 
##      H      D      B      N 
##  "ACT"  "AGT"  "CGT" "ACGT"

DNA_ALPHABET   # Allowed characters in a DNAString

##  [1] "A" "C" "G" "T" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+" "."

DNA_BASES      # The four core DNA bases (A, C, G, T)

## [1] "A" "C" "G" "T"

RNA_ALPHABET   # Allowed characters in an RNAString

##  [1] "A" "C" "G" "U" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+" "."

RNA_BASES      # The four core RNA bases (A, C, G, U)

## [1] "A" "C" "G" "U"

# Attempting to create a DNAString with non-DNA characters generates an error/warning
# a = DNAString("I am a string") # (Intentionally commented out to prevent error if run)

# Create a valid DNA string
a = DNAString("ATTGCC")
a

## 6-letter DNAString object
## seq: ATTGCC

# Create a valid RNA string
# b = RNAString("ATTGCC") # Warning/Error in strict modes, 'T' is not standard RNA
b = RNAString("AUUGCC") # Correct RNA string
b = RNAString(a)      # Converts the DNAString 'a' into an RNAString (T becomes U automatically)
b

## 6-letter RNAString object
## seq: AUUGCC

# Explore available methods for the DNAStringSet class
methods(class="DNAStringSet") |> head()

## [1] "!,List-method"           "!=,ANY,Vector-method"   
## [3] "!=,Vector,ANY-method"    "!=,Vector,Vector-method"
## [5] "[,List-method"           "[[,List-method"

# Bring up the help page for DNAStringSet
?"DNAStringSet"
# Open the Biostrings package vignettes in the browser
# browseVignettes(package="Biostrings") 

b

## 6-letter RNAString object
## seq: AUUGCC

## 6-letter DNAString object
## seq: ATTGCC

length(a) # Length of the DNA sequence

## [1] 6

a[1:3]    # Subset the first 3 nucleotides

## 3-letter DNAString object
## seq: ATT

Sequence Frequencies and Complements

Biostrings provides optimized functions to count base frequencies and compute complements.

## frequencies
# Count occurrences of all letters in the DNA alphabet
alphabetFrequency(a)

## A C G T M R W S Y K V H D B N - + . 
## 1 2 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0

# Count occurrences of only the primary bases (A, C, G, T), ignoring ambiguous bases
alphabetFrequency(a, baseOnly=TRUE)

##     A     C     G     T other 
##     1     2     1     2     0

# Count the frequency of a specific letter
letterFrequency(a, "C")

## C 
## 2

# Count the combined frequency of C and G
letterFrequency(a, "CG")

## C|G 
##   3

# Count C and G frequencies in a sliding window of width 3 across the sequence
letterFrequencyInSlidingView(a, view.width = 3, letters = "GC")

##      G|C
## [1,]   0
## [2,]   1
## [3,]   2
## [4,]   3

# Calculate frequencies of all 16 possible dinucleotides
dinucleotideFrequency(a)

## AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT 
##  0  0  0  1  0  1  0  0  0  1  0  0  0  0  1  1

## complements
a

## 6-letter DNAString object
## seq: ATTGCC

# Generate the sequence complement (A->T, C->G)
complement(a)

## 6-letter DNAString object
## seq: TAACGG

# Generate the reverse complement (Standard for finding the opposite DNA strand)
reverseComplement(a)

## 6-letter DNAString object
## seq: GGCAAT

String Distances and Pattern Matching

This chunk calculates the Levenshtein distance (edit distance) between sequences, clusters them, and demonstrates finding substrings with optional mismatch tolerance.

######## String matching
a = DNAString("ACGTACGTACGC")
b = DNAString("ACCTACGTAGGG")
c = DNAString("AAACCCTTTGGG")

# Combine into a DNAStringSet. Start/End define bounds to extract from the strings.
abc <- DNAStringSet(c(a, b, c), start = c(1, 13, 25), end = c(12, 24, 36))
abc

## DNAStringSet object of length 3:
##     width seq
## [1]    12 ACGTACGTACGC
## [2]    12 ACCTACGTAGGG
## [3]    12 AAACCCTTTGGG

# Compute pairwise string distances using Levenshtein (edit distance)
library(pwalign) # pairwise sequence alignments

## 
## Attaching package: 'pwalign'

## The following objects are masked from 'package:Biostrings':
## 
##     aligned, alignedPattern, alignedSubject, compareStrings, deletion,
##     errorSubstitutionMatrices, indel, insertion, mismatchSummary,
##     mismatchTable, nedit, nindel, nucleotideSubstitutionMatrix,
##     pairwiseAlignment, PairwiseAlignments,
##     PairwiseAlignmentsSingleSubject, pattern, pid,
##     qualitySubstitutionMatrices, stringDist, unaligned,
##     writePairwiseAlignments

sdist <- stringDist(abc, method = "levenshtein")
sdist # Returns an object of class "dist".

##   1 2
## 2 3  
## 3 8 6

# Perform hierarchical clustering on the distance matrix and plot it
clust <- hclust(sdist, method = "average")
plot(clust)

# Pattern matching with mismatches
a = DNAString("TAGCTTATCAGACTGATGTTGACAGATCGGAAGAGCACACGTCTGAACTCC")
b = DNAString("AAGATCGGAAGAGCACACGTCT")

# ?matchPattern
# Find b in a, allowing up to 1 mismatch
matchPattern(b, a, max.mismatch = 1)

## Views on a 51-letter DNAString subject
## subject: TAGCTTATCAGACTGATGTTGACAGATCGGAAGAGCACACGTCTGAACTCC
## views:
##       start end width
##   [1]    23  44    22 [CAGATCGGAAGAGCACACGTCT]

## 51-letter DNAString object
## seq: TAGCTTATCAGACTGATGTTGACAGATCGGAAGAGCACACGTCTGAACTCC

# Find exact matches of "CGT" in a
matchPattern("CGT", a)

## Views on a 51-letter DNAString subject
## subject: TAGCTTATCAGACTGATGTTGACAGATCGGAAGAGCACACGTCTGAACTCC
## views:
##       start end width
##   [1]    40  42     3 [CGT]

# Find matches of "CGT" allowing 1 mismatch
matchPattern("CGT", a, max.mismatch=1)

## Views on a 51-letter DNAString subject
## subject: TAGCTTATCAGACTGATGTTGACAGATCGGAAGAGCACACGTCTGAACTCC
## views:
##       start end width
##   [1]     4   6     3 [CTT]
##   [2]    17  19     3 [TGT]
##   [3]    28  30     3 [CGG]
##   [4]    40  42     3 [CGT]

# Store the match object and extract its properties
m = matchPattern("CGT", a, max.mismatch=1)
m

## Views on a 51-letter DNAString subject
## subject: TAGCTTATCAGACTGATGTTGACAGATCGGAAGAGCACACGTCTGAACTCC
## views:
##       start end width
##   [1]     4   6     3 [CTT]
##   [2]    17  19     3 [TGT]
##   [3]    28  30     3 [CGG]
##   [4]    40  42     3 [CGT]

start(m)  # Start positions of matches

## [1]  4 17 28 40

end(m)    # End positions of matches

## [1]  6 19 30 42

length(m) # Number of matches found

## [1] 4

# Just count the matches without returning their coordinates
countPattern("CGT", a, max.mismatch=1)

## [1] 4

Dictionaries and Position Weight Matrices (PWM)

For more complex matching, you can match multiple patterns at once using dictionaries, or score sequences using a PWM.

## multiple matching
a = DNAString("ACGTACGTACGC")
# Create a dictionary of multiple patterns to search for simultaneously
dict0 = PDict(c("CGT", "ACG"))
# Find all occurrences of the dictionary patterns in the subject sequence
mm = matchPDict(dict0, a)
mm

## MIndex object of length 2
## [[1]]
## IRanges object with 2 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         2         4         3
##   [2]         6         8         3
## 
## [[2]]
## IRanges object with 3 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         1         3         3
##   [2]         5         7         3
##   [3]         9        11         3

mm[[1]] # Matches for the first dictionary item ("CGT")

## IRanges object with 2 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         2         4         3
##   [2]         6         8         3

mm[[2]] # Matches for the second dictionary item ("ACG")

## IRanges object with 3 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         1         3         3
##   [2]         5         7         3
##   [3]         9        11         3

## matching using PWM (Position Weight Matrix)
# Define a manual PWM (4 rows for A,C,G,T and columns representing positions)
motif = matrix(c(0.97, 0.01, 0.01, 0.01, 0.1, 0.5, 0.39, 0.01, 0.01, 0.05, 0.5, 0.44), nrow=4)
rownames(motif) = c("A", "C", "G", "T")
motif

##   [,1] [,2] [,3]
## A 0.97 0.10 0.01
## C 0.01 0.50 0.05
## G 0.01 0.39 0.50
## T 0.01 0.01 0.44

a = DNAString("ACGTACGTACTC")
# Find sequence regions that match the PWM well enough
matchPWM(motif, a)

## Views on a 12-letter DNAString subject
## subject: ACGTACGTACTC
## views:
##       start end width
##   [1]     1   3     3 [ACG]
##   [2]     5   7     3 [ACG]
##   [3]     9  11     3 [ACT]

# Count how many regions match the PWM
countPWM(motif, a)

## [1] 3

# Score the sequence against the PWM starting at specific positions (1 through 10)
PWMscoreStartingAt(motif, a, 1:10)

##  [1] 1.97 0.84 0.03 0.16 1.97 0.84 0.03 0.16 1.91 0.07

XStringViews vs. DNAStringSet

Views define specific segments of a single sequence without duplicating it in memory. DNAStringSet is a collection of distinct sequences.

######### multiple strings: XStringViews
a = DNAString("ACGTACGTACTC")
# Create 'Views' on sequence 'a', defining three distinct windows
a2 = Views(a, start=c(1,5,8), end=c(3,8,12))
a2

## Views on a 12-letter DNAString subject
## subject: ACGTACGTACTC
## views:
##       start end width
##   [1]     1   3     3 [ACG]
##   [2]     5   8     4 [ACGT]
##   [3]     8  12     5 [TACTC]

subject(a2) # The underlying base sequence the views reference

## 12-letter DNAString object
## seq: ACGTACGTACTC

length(a2)  # Number of views

## [1] 3

start(a2)   # Start coordinates

## [1] 1 5 8

end(a2)     # End coordinates

## [1]  3  8 12

# Calculate frequencies across all views
alphabetFrequency(a2, baseOnly=TRUE)

##      A C G T other
## [1,] 1 1 1 0     0
## [2,] 1 1 1 1     0
## [3,] 1 2 0 2     0

a2

## Views on a 12-letter DNAString subject
## subject: ACGTACGTACTC
## views:
##       start end width
##   [1]     1   3     3 [ACG]
##   [2]     5   8     4 [ACGT]
##   [3]     8  12     5 [TACTC]

# Logical check on the views
a2 == DNAString("ACGT")

## [1] FALSE  TRUE FALSE

toString(a2)

## [1] "ACG, ACGT, TACTC"

## DNAStringSet
a = DNAString("ACGTACGTACTC")
# Instead of views, create a distinct set of strings extracted from 'a'
a2 = DNAStringSet(a, start=c(1,5,9), end=c(4,8,12))
a2

## DNAStringSet object of length 3:
##     width seq
## [1]     4 ACGT
## [2]     4 ACGT
## [3]     4 ACTC

a2[[1]] # Access the first string in the set

## 4-letter DNAString object
## seq: ACGT

alphabetFrequency(a2, baseOnly=TRUE)

##      A C G T other
## [1,] 1 1 1 1     0
## [2,] 1 1 1 1     0
## [3,] 1 2 0 1     0

Operations Allowed on StringSets but not Views

Certain mathematical set operations and vectorized matching functions are only designed for StringSet objects.

###### Operations only allowed for StringSet but not StringViews
a = DNAString("ACGTACGTACTC")
a1 = DNAStringSet(a, start=c(1,5,9), end=c(4,8,12))
a1

## DNAStringSet object of length 3:
##     width seq
## [1]     4 ACGT
## [2]     4 ACGT
## [3]     4 ACTC

unique(a1) # Returns unique sequences in the set

## DNAStringSet object of length 2:
##     width seq
## [1]     4 ACGT
## [2]     4 ACTC

a2 = Views(a, start=c(1,5,9), end=c(4,8,12))
a2

## Views on a 12-letter DNAString subject
## subject: ACGTACGTACTC
## views:
##       start end width
##   [1]     1   4     4 [ACGT]
##   [2]     5   8     4 [ACGT]
##   [3]     9  12     4 [ACTC]

# unique(a2) # Works only for Sets, running this on Views generates an error

a1 = Views(a, start=c(1,9), end=c(4,12))
a1

## Views on a 12-letter DNAString subject
## subject: ACGTACGTACTC
## views:
##       start end width
##   [1]     1   4     4 [ACGT]
##   [2]     9  12     4 [ACTC]

a2 = Views(a, start=c(1), end=c(4))
a2

## Views on a 12-letter DNAString subject
## subject: ACGTACGTACTC
## views:
##       start end width
##   [1]     1   4     4 [ACGT]

# setdiff(a1, a2) ## this will generate error because Views don't support set algebra
# union(a1, a2)   ## error

### set operations are allowed for StringSet
a1 = DNAStringSet(a, start=c(1,9), end=c(4,12))
a1

## DNAStringSet object of length 2:
##     width seq
## [1]     4 ACGT
## [2]     4 ACTC

a2 = DNAStringSet(a, start=c(1), end=c(4))
a2

## DNAStringSet object of length 1:
##     width seq
## [1]     4 ACGT

setdiff(a1, a2) # Find sequences in a1 not in a2

## DNAStringSet object of length 1:
##     width seq
## [1]     4 ACTC

union(a1, a2)   # Combine unique sequences from a1 and a2

## DNAStringSet object of length 2:
##     width seq
## [1]     4 ACGT
## [2]     4 ACTC

####### matching for multiple strings
a = DNAString("ACGTACGTACTC")
a2 = DNAStringSet(a, start=c(1,5,9), end=c(4,8,12))
a2

## DNAStringSet object of length 3:
##     width seq
## [1]     4 ACGT
## [2]     4 ACGT
## [3]     4 ACTC

# Vectorized match pattern - searches for "CG" across all strings in the set
vv = vmatchPattern("CG", a2)
vv

## MIndex object of length 3
## [[1]]
## IRanges object with 1 range and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         2         3         2
## 
## [[2]]
## IRanges object with 1 range and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         2         3         2
## 
## [[3]]
## IRanges object with 0 ranges and 0 metadata columns:
##        start       end     width
##    <integer> <integer> <integer>

vv[[1]] # Matches found in the first string

## IRanges object with 1 range and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]         2         3         2

## doesn't work for Views
a2 = Views(a, start=c(1,5,9), end=c(4,8,12))
a2

## Views on a 12-letter DNAString subject
## subject: ACGTACGTACTC
## views:
##       start end width
##   [1]     1   4     4 [ACGT]
##   [2]     5   8     4 [ACGT]
##   [3]     9  12     4 [ACTC]

# vv = vmatchPattern("CG", a2) # Generates an error, vmatchPattern requires a StringSet

BSgenome and Reference Data Packages

BSgenome packages contain full reference genomes. This demonstrates how to interact with full chromosomes.

#####################################################
## Now show BSgenome and data packages
#####################################################
# Note: Ensure BSgenome and BSgenome.Hsapiens.UCSC.hg38 are installed
library(BSgenome)

# List all available genomes you have installed
available.genomes()

# Load the Human reference genome (hg38)
library(BSgenome.Hsapiens.UCSC.hg38)
# List objects available within this package
ls("package:BSgenome.Hsapiens.UCSC.hg38")

Hsapiens
Hsapiens$chr1 # Extract full sequence for Chromosome 1

seqnames(Hsapiens) # List all chromosome names
seqlengths(Hsapiens)[1:25] # View the length (in bp) of the first 25 chromosomes

# Slice out a specific 11bp region from Chromosome 1
Hsapiens$chr1[100000000:100000010]

# Calculate the base frequencies across all of Chromosome 1
alphabetFrequency(Hsapiens$chr1, baseOnly=TRUE)
# Get the relative proportions of each base on Chromosome 1
alphabetFrequency(Hsapiens$chr1, baseOnly=TRUE) / length(Hsapiens$chr1)

# Find all occurrences of the "CG" dinucleotide on Chromosome 1
mm = matchPattern("CG", Hsapiens$chr1)
length(mm) # Count of CG sites
mm

Masked Genomes and SNPs

Masked genomes block out repetitive or uncharacterized regions. SNP packages inject known variant locations into the reference genome.

### masking
# Note: Requires BSgenome.Hsapiens.UCSC.hg38.masked
library(BSgenome.Hsapiens.UCSC.hg38.masked)

# Accessing a masked chromosome reveals active masks (e.g., against repeats)
Hsapiens[["chr1"]]
Hsapiens$chr1

## SNPs
# List available and installed SNP packages
available.SNPs()
installed.SNPs()

# Inject SNPs into the reference genome to create a variant-aware genome object
# (Commented out because the exact dbSNP package needs to be installed locally)
# SnpHsapiens <- injectSNPs(Hsapiens, "SNPlocs.Hsapiens.dbSNP.20120608")
# SnpHsapiens

# snpcount(SnpHsapiens) # Count SNPs injected
# head(snplocs(SnpHsapiens, "chr1")) # View the first few SNP locations on chr1

IUPAC_CODE_MAP

Genomic Ranges and Alignments

Using GenomicRanges and GenomicAlignments to find sequencing reads that span or support specific biological events (like splice junctions).

# Find reads supporting the junction identified above, at position
# 19653707 + 66M = 19653773 of chromosome 14

library(GenomicRanges)
library(GenomicAlignments)
library(Rsamtools)

## our 'region of interest' (ROI)
IRanges(start = 19653773, width=1)
# Create a GRanges object mapping the ROI to chromosome 14
roi <- GRanges("chr14", IRanges(start = 19653773, width=1)) 
roi

## sample data
# Load sample BAM file containing RNA-seq data
library('RNAseqData.HNRNPC.bam.chr14')
bf <- BamFile(RNAseqData.HNRNPC.bam.chr14_BAMFILES[[1]], asMates=TRUE)
bf

## alignments, junctions, overlapping our roi
# ?readGAlignmentsList
# Read paired-end alignments into memory
paln <- readGAlignmentsList(bf)
paln

# ?summarizeJunctions
# Extract and summarize splice junctions from the alignments
j <- summarizeJunctions(paln, with.revmap=TRUE)
j

# Subset junctions to find only the ones overlapping our Region of Interest
j_overlap <- j[j %over% roi]
j_overlap

## supporting reads
# Use the reverse map to extract the actual reads that support the junction at the ROI
paln[j_overlap$revmap[[1]]]

Variant Annotation

This final chunk takes variants from a standard VCF file and determines their functional impact by intersecting them with a known database of gene models.

# Read variants from a VCF file, and annotate with respect to a known gene model

## input variants
library(VariantAnnotation)
# Locate a sample VCF file shipped with the VariantAnnotation package
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
# Read the VCF file, specifying it aligns to human genome build hg19
vcf <- readVcf(fl, "hg19")
# Subset to focus only on chromosome 22
seqlevels(vcf) <- "chr22"

## known gene model
# Load the reference database of transcripts for hg19
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

# Intersect the variants (from the VCF) with the known gene models (TxDb) 
# Specifically look for coding variants (e.g., synonymous, non-synonymous)
coding <- locateVariants(
  rowRanges(vcf),
  TxDb.Hsapiens.UCSC.hg19.knownGene,
  CodingVariants()
)

# View the resulting variant annotations
head(coding)