PCA is a great way to explore your sequencing data, assess structure, patterns, or discover plate inversions. Yay. Generally it reduces the dimensionality of data and provides a way to view that data along two axes.
The newest angsd
version (0.918+) provides a way to do PCA without subsampling your data. This creates a covariance matrix bt sampling a single read at each polymorphic site. The basic angsd call for this is as follows:
~/bin/angsd -bam ${bamlist} -out ${out}.PCAMDS1 -doIBS 1 -doCounts 1 -doMajorMinor 1 -minFreq 0.05 -maxMis ${nind} -minMapQ 30 -minQ 20 -SNP_pval 0.000001 -makeMatrix 1 -doCov 1 -GL 1 -doMaf 2 -doPost 2 -nThreads 16
Notes:
doIBS
give extra time/memory. This method can take a long time because it uses all sites in the data.ls
all bamfiles that have a certain subsampling threshold (say 100k), and then sed
out the 100k part of the filename, so you match the the original bam file name, but only keep bams that have read counts 100k or greater. This is a good way to use all the data, but only for samples that have great than a certain number of reads.ngsCovar
PCA with subsamplingThis method uses the other genotype calling option and ngsCovar
.
PCA_CALC.sh:
#!/bin/bash -l
#SBATCH -o slurm_outs/03_pca_calc-%j.out
#SBATCH -J pca_calc_sites
echo "Starting Job: "
date
bamlist=$1
out=$2
bamdir=$3
nInd=$(wc ${bamdir}/${bamlist} | awk '{print $1}')
minInd=$[$nInd/2]
outdir='angsd_output'
angsd -bam ${bamdir}/${bamlist} -out ${outdir}/${out} -minQ 20 -minMapQ 10 -minInd $minInd -GL 1 -doMajorMinor 1 -doMaf 2 -doPost 2 -doGeno 32 -minMaf 0.05 -postCutoff 0.95 -SNP_pval 1e-6 -sites bait_lengths.txt
gunzip ${outdir}/${out}.*.gz
count=$(sed 1d ${outdir}/${out}.mafs | wc -l | awk '{print $1}')
ngsCovar -probfile ${outdir}/${out}.geno -outfile ${outdir}/${out}.covar -nind $nInd -nsites $count -call 1
echo "Ending Job: "
date
To plot, you’ll need to assign IDs using a clst
file. I use the following steps (assuming we are using the 100k
subsampled bamlist):
sed
to create a clst file of the samples/bamnames (i.e., sed 's/_RA\.sort\.flt_100000\.bam/ 1 1/g' subbamlist > out.clst
)out.clst
and insert a header FID IID CLUSTER
:
pca_plot.sh
to create 3 pdfs (pc1 vs pc2, pc1 v pc3, pc2 vs. pc3). It can be modified.PCA_PLOT
#!/bin/bash -l
#SBATCH -o slurm_outs/04_pca_pdf-%j.out
#OUTDIR=pca_pdfs
#SBATCH -J PCA_plots
echo "Starting Job: "
date
out=$1
bamclst=$2
outdir='pca_pdfs'
homedir='/home/fastq'
Rscript --vanilla --slave ${homedir}/pca_makeplot.R -i ./angsd_output/${out}.covar -c 1-2 -a ${homedir}/${bamclst} -o ${outdir}/${out}_12_pca.pdf
Rscript --vanilla --slave ${homedir}/pca_makeplot.R -i ./angsd_output/${out}.covar -c 1-3 -a ${homedir}/${bamclst} -o ${outdir}/${out}_13_pca.pdf
Rscript --vanilla --slave ${homedir}/pca_makeplot.R -i ./angsd_output/${out}.covar -c 2-3 -a ${homedir}/${bamclst} -o ${outdir}/${out}_23_pca.pdf
echo "Ending Job: "
date
Rscript to Plot PCA
#!/usr/bin/Rscript
# Usage: Rscript -i infile.covar -c component1-component2 -a annotation.file -o outfile.eps
#.libPaths("~/R/x86_64-pc-linux-gnu-library/3.2")
library(optparse)
library(ggplot2)
library(stringi)
library(viridis)
#if(!require(viridis)) {
# install.packages("viridis"); require(viridis)} #load / install+load installr
<- list(make_option(c('-i','--in_file'), action='store', type='character', default=NULL, help='Input file (output from ngsCovar)'),
option_list make_option(c('-c','--comp'), action='store', type='character', default=1-2, help='Components to plot'),
make_option(c('-a','--annot_file'), action='store', type='character', default=NULL, help='Annotation file with individual classification (2 column TSV with ID and ANNOTATION)'),
make_option(c('-o','--out_file'), action='store', type='character', default=NULL, help='Output file')
)<- parse_args(OptionParser(option_list = option_list))
opt
# Annotation file is in plink cluster format
#################################################################################
# Read input file
<- read.table(opt$in_file, stringsAsFact=F);
covar
# Read annot file
<- read.table(opt$annot_file, sep=" ", header=T); # note that plink cluster files are usually tab-separated instead
annot
# Parse components to analyze
<- as.numeric(strsplit(opt$comp, "-", fixed=TRUE)[[1]])
comp
# Eigenvalues
<- eigen(covar, symm=TRUE);
eig $val <- eig$val/sum(eig$val);
eigcat(signif(eig$val, digits=3)*100,"\n");
# Write eigenvalues
#write.table(eig, file = "eigen_scores.txt", quote = FALSE)
# Plot
<- as.data.frame(eig$vectors)
PC colnames(PC) <- gsub("V", "PC", colnames(PC))
$River <- factor(strtrim(annot$FID,5)) # adjust/add a column of interest here from metadata
PC$Pop <- factor(annot$CLUSTER)
PC$ID <- factor(annot$IID)
PC$IDname <- factor(stringr::str_extract(annot$FID, pattern = "(?<=_)([A-Z]{2})")) # change to match your naming convention
PC
# Write PC components
#write.table(PC, file = "PC_scores.txt", quote = FALSE)
<- paste("PC",comp[1]," (",signif(eig$val[comp[1]], digits=3)*100,"%)"," / PC",comp[2]," (",signif(eig$val[comp[2]], digits=3)*100,"%)",sep="",collapse="")
title
= paste("PC",comp[1],sep="")
x_axis = paste("PC",comp[2],sep="")
y_axis
ggplot() + geom_point(data=PC, aes_string(x=x_axis, y=y_axis, color="IDname", shape="River"), size=5) +
geom_text(data=PC, aes_string(x=x_axis, y=y_axis, label="IDname", vjust= -0.784), check_overlap=TRUE, cex=2.5, hjust=-0.3) +
ggtitle(title) + theme_bw() + scale_color_viridis(discrete=TRUE)
ggsave(opt$out_file)
unlink("Rplots.pdf", force=TRUE)
For data exploration, you can use the following script to make an interactive or dynamic plot, which is useful for investigating your data. You’ll need to grab the .covar
file, and have the .clst
file from your original call.
# LIBRARIES ---------------------------------------------------------------
suppressPackageStartupMessages({
library(tidyverse); # for ggplot2 and dplyr
library(viridis); # for color palettes
library(plotly) # interactive plotting
})
# READ INPUT FILES --------------------------------------------------------
<-"rabo_mfa_100k"
covarfile<-"rabo_mfa_100k"
clstfile
# read covar file
<- paste0("./data/",covarfile, ".covar")
covar_path <- read.table(covar_path, stringsAsFactors = F);
covar
# Read annot file
<- read.table(paste0("data/bamlist_mrg_",clstfile, "_clst"),stringsAsFactors = F, header = TRUE);
annot
# CREATE EIGEN MATRIX -----------------------------------------------------
# Eigenvalues
<- eigen(covar, symm=TRUE);
eig $val <- eig$val/sum(eig$val);
eig
# SET UP PCA -------------------------------------------------------------
# get PC colnames and set label/col/pops
<- as.data.frame(eig$vectors)
PC colnames(PC) <- gsub("V", "PC", colnames(PC))
$Region <- factor(annot$FID)
PC$Pop <- factor(annot$CLUSTER)
PC$ID <- factor(annot$IID)
PC
# SET UP PLOTS ------------------------------------------------------------
# set the xy vars:
= 'PC1'
x1 = 'PC2'
y1
# Create the titles for each PCA
<- paste("PC1 (",signif(eig$val[1], digits=3)*100,"%)"," / PC2 (",signif(eig$val[2], digits=3)*100,"%)", sep="",collapse="")
title12 <- paste("PC1 (",signif(eig$val[1], digits=3)*100,"%)"," / PC3 (",signif(eig$val[3], digits=3)*100,"%)", sep="",collapse="")
title13 <- paste("PC2 (",signif(eig$val[2], digits=3)*100,"%)"," / PC3 (",signif(eig$val[3], digits=3)*100,"%)",sep="",collapse="")
title23
# TEST PLOTLY PLOTS ------------------------------------------------------
# 1 v 2
<- ggplotly(p = ggplot(data=PC, aes(x=PC1, y=PC2, color=Pop,
(p1v2 label=paste(Region,"-",ID))) +
geom_point(size=4, alpha=0.8) +
labs(title=paste0(covarfile, ": ",title12)) +
theme_bw() +
scale_color_viridis(discrete = TRUE, option = "D")))