Principal Component Analysis

PCA is a great way to explore your sequencing data, assess structure, patterns, or discover plate inversions. Yay. Generally it reduces the dimensionality of data and provides a way to view that data along two axes.

IBS PCA

The newest angsd version (0.918+) provides a way to do PCA without subsampling your data. This creates a covariance matrix bt sampling a single read at each polymorphic site. The basic angsd call for this is as follows:

~/bin/angsd -bam ${bamlist} -out ${out}.PCAMDS1 -doIBS 1 -doCounts 1 -doMajorMinor 1 -minFreq 0.05 -maxMis ${nind} -minMapQ 30 -minQ 20 -SNP_pval 0.000001 -makeMatrix 1 -doCov 1 -GL 1 -doMaf 2 -doPost 2 -nThreads 16

Notes:

  • when running with doIBS give extra time/memory. This method can take a long time because it uses all sites in the data.
  • if you’ve already run the subsample script, one option is to ls all bamfiles that have a certain subsampling threshold (say 100k), and then sed out the 100k part of the filename, so you match the the original bam file name, but only keep bams that have read counts 100k or greater. This is a good way to use all the data, but only for samples that have great than a certain number of reads.
  • The post-processing scripts for plotting should be largely the same.

ngsCovar PCA with subsampling

This method uses the other genotype calling option and ngsCovar.

PCA_CALC.sh:


#!/bin/bash -l

#SBATCH -o slurm_outs/03_pca_calc-%j.out
#SBATCH -J pca_calc_sites

echo "Starting Job: "
date

bamlist=$1
out=$2
bamdir=$3 

nInd=$(wc ${bamdir}/${bamlist} | awk '{print $1}')
minInd=$[$nInd/2]
outdir='angsd_output'

angsd -bam ${bamdir}/${bamlist} -out ${outdir}/${out} -minQ 20 -minMapQ 10 -minInd $minInd -GL 1 -doMajorMinor 1 -doMaf 2 -doPost 2 -doGeno 32 -minMaf 0.05 -postCutoff 0.95 -SNP_pval 1e-6 -sites bait_lengths.txt

gunzip ${outdir}/${out}.*.gz

count=$(sed 1d ${outdir}/${out}.mafs | wc -l | awk '{print $1}')

ngsCovar -probfile ${outdir}/${out}.geno -outfile ${outdir}/${out}.covar -nind $nInd -nsites $count -call 1

echo "Ending Job: "
date

To plot, you’ll need to assign IDs using a clst file. I use the following steps (assuming we are using the 100k subsampled bamlist):

  1. Use sed to create a clst file of the samples/bamnames (i.e., sed 's/_RA\.sort\.flt_100000\.bam/ 1 1/g' subbamlist > out.clst)
  2. VIM (or other text editor) to edit the out.clst and insert a header FID IID CLUSTER:
    • (IID changes symbol; CLUSTER changes color)
    • Feel free to modify this file according to symbol or color as you see fit for the final PCA plot
  3. Use pca_plot.sh to create 3 pdfs (pc1 vs pc2, pc1 v pc3, pc2 vs. pc3). It can be modified.

PCA_PLOT


#!/bin/bash -l

#SBATCH -o slurm_outs/04_pca_pdf-%j.out
#OUTDIR=pca_pdfs
#SBATCH -J PCA_plots

echo "Starting Job: "
date

out=$1
bamclst=$2

outdir='pca_pdfs'
homedir='/home/fastq'

Rscript --vanilla --slave  ${homedir}/pca_makeplot.R -i ./angsd_output/${out}.covar -c 1-2 -a ${homedir}/${bamclst} -o ${outdir}/${out}_12_pca.pdf
Rscript --vanilla --slave ${homedir}/pca_makeplot.R -i ./angsd_output/${out}.covar -c 1-3 -a ${homedir}/${bamclst} -o ${outdir}/${out}_13_pca.pdf
Rscript --vanilla --slave ${homedir}/pca_makeplot.R -i ./angsd_output/${out}.covar -c 2-3 -a ${homedir}/${bamclst} -o ${outdir}/${out}_23_pca.pdf

echo "Ending Job: "
date

Rscript to Plot PCA

#!/usr/bin/Rscript
# Usage: Rscript -i infile.covar -c component1-component2 -a annotation.file -o outfile.eps

#.libPaths("~/R/x86_64-pc-linux-gnu-library/3.2")

library(optparse)
library(ggplot2)
library(stringi)
library(viridis)

#if(!require(viridis)) { 
#  install.packages("viridis"); require(viridis)} #load / install+load installr

option_list <- list(make_option(c('-i','--in_file'), action='store', type='character', default=NULL, help='Input file (output from ngsCovar)'),
                    make_option(c('-c','--comp'), action='store', type='character', default=1-2, help='Components to plot'),
                    make_option(c('-a','--annot_file'), action='store', type='character', default=NULL, help='Annotation file with individual classification (2 column TSV with ID and ANNOTATION)'),
                    make_option(c('-o','--out_file'), action='store', type='character', default=NULL, help='Output file')
                    )
opt <- parse_args(OptionParser(option_list = option_list))

# Annotation file is in plink cluster format

#################################################################################

# Read input file
covar <- read.table(opt$in_file, stringsAsFact=F);

# Read annot file
annot <- read.table(opt$annot_file, sep=" ", header=T); # note that plink cluster files are usually tab-separated instead

# Parse components to analyze
comp <- as.numeric(strsplit(opt$comp, "-", fixed=TRUE)[[1]])

# Eigenvalues
eig <- eigen(covar, symm=TRUE);
eig$val <- eig$val/sum(eig$val);
cat(signif(eig$val, digits=3)*100,"\n");


# Write eigenvalues
#write.table(eig, file = "eigen_scores.txt", quote = FALSE)

# Plot
PC <- as.data.frame(eig$vectors)
colnames(PC) <- gsub("V", "PC", colnames(PC))
PC$River <- factor(strtrim(annot$FID,5)) # adjust/add a column of interest here from metadata
PC$Pop <- factor(annot$CLUSTER)
PC$ID <- factor(annot$IID)
PC$IDname <- factor(stringr::str_extract(annot$FID, pattern = "(?<=_)([A-Z]{2})")) # change to match your naming convention

# Write PC components

#write.table(PC, file = "PC_scores.txt", quote = FALSE)

title <- paste("PC",comp[1]," (",signif(eig$val[comp[1]], digits=3)*100,"%)"," / PC",comp[2]," (",signif(eig$val[comp[2]], digits=3)*100,"%)",sep="",collapse="")

x_axis = paste("PC",comp[1],sep="")
y_axis = paste("PC",comp[2],sep="")

ggplot() + geom_point(data=PC, aes_string(x=x_axis, y=y_axis, color="IDname", shape="River"), size=5) + 
        geom_text(data=PC, aes_string(x=x_axis, y=y_axis, label="IDname", vjust= -0.784), check_overlap=TRUE, cex=2.5, hjust=-0.3) + 
        ggtitle(title) + theme_bw() + scale_color_viridis(discrete=TRUE)
ggsave(opt$out_file)
unlink("Rplots.pdf", force=TRUE)

Dynamic PCA

For data exploration, you can use the following script to make an interactive or dynamic plot, which is useful for investigating your data. You’ll need to grab the .covar file, and have the .clst file from your original call.

# LIBRARIES ---------------------------------------------------------------

suppressPackageStartupMessages({
  library(tidyverse); # for ggplot2 and dplyr
  library(viridis); # for color palettes
  library(plotly) # interactive plotting
})

# READ INPUT FILES --------------------------------------------------------

covarfile<-"rabo_mfa_100k"
clstfile<-"rabo_mfa_100k"

# read covar file
covar_path <- paste0("./data/",covarfile, ".covar")
covar <- read.table(covar_path, stringsAsFactors = F);

# Read annot file
annot <- read.table(paste0("data/bamlist_mrg_",clstfile, "_clst"),stringsAsFactors = F, header = TRUE); 

# CREATE EIGEN MATRIX -----------------------------------------------------

# Eigenvalues
eig <- eigen(covar, symm=TRUE);
eig$val <- eig$val/sum(eig$val);

# SET UP PCA -------------------------------------------------------------

# get PC colnames and set label/col/pops
PC <- as.data.frame(eig$vectors)
colnames(PC) <- gsub("V", "PC", colnames(PC))
PC$Region <- factor(annot$FID)
PC$Pop <- factor(annot$CLUSTER)
PC$ID <- factor(annot$IID)

# SET UP PLOTS ------------------------------------------------------------

# set the xy vars:
x1 = 'PC1'
y1 = 'PC2'

# Create the titles for each PCA
title12 <- paste("PC1 (",signif(eig$val[1], digits=3)*100,"%)"," / PC2 (",signif(eig$val[2], digits=3)*100,"%)", sep="",collapse="")
title13 <- paste("PC1 (",signif(eig$val[1], digits=3)*100,"%)"," / PC3 (",signif(eig$val[3], digits=3)*100,"%)", sep="",collapse="")
title23 <- paste("PC2 (",signif(eig$val[2], digits=3)*100,"%)"," / PC3 (",signif(eig$val[3], digits=3)*100,"%)",sep="",collapse="")

# TEST PLOTLY PLOTS ------------------------------------------------------

# 1 v 2
(p1v2 <- ggplotly(p = ggplot(data=PC, aes(x=PC1, y=PC2, color=Pop, 
                                          label=paste(Region,"-",ID))) +
                    geom_point(size=4, alpha=0.8) + 
                    labs(title=paste0(covarfile, ": ",title12)) +
                    theme_bw() +
                    scale_color_viridis(discrete = TRUE, option = "D")))