The de novo step is not necessary if a model genome or previous alignment is available (skip to Reference Assembly)
Pick your best 5–10 individuals (using du -hs
or wc -l
)
Copy these individuals (just the RA
files) into a new directory
ls fastq | sed "s:.fastq::g" > list
Hash_reads.sh
, which will run all three perl scripts (see Option 2) Hash_reads.sh
#!/bin/bash -l
F1=$1 # input file
n=$(wc -l $F1 | awk '{print $1}')
x=1
while [ $x -le $n ]
do
string="sed -n ${x}p $F1"
str=$($string)
var=$(echo $str | awk -F"\t" '{print $1}')
set -- $var
c1=$1 ### Sample name ###
echo "#!/bin/bash" > ${c1}.sh
echo "" >> ${c1}.sh
# echo "perl QualityFilter.pl ${c1}.fastq > ${c1}_L80P80.fastq" >> ${c1}.sh
echo "perl HashSeqs.pl ${c1}_L80P80.fastq ${c1} > ${c1}_L80P80.hash" >> ${c1}.sh
echo "perl PrintHashHisto.pl ${c1}_L80P80.hash > ${c1}_L80P80.txt" >> ${c1}.sh
sbatch -c 1 ${c1}.sh
x=$(( $x + 1 ))
done
perl QualityFilter.pl Dpup_007_RA.fastq > Dpupt_oo7_RA_QF.fastq
QualityFilter.pl
#!/usr/bin/perl
########################################################################
$percent_filter = 80;
$length = 80;
$phred = 33; #The value should be either 33 or 64 depending on the base-calling pipeline.
########################################################################
$file = $ARGV[0];
open(FILE, "<$file")
or die;
while (<FILE>) {
$R1_ID1 = $_;
$R1_seq = <FILE>;
$R1_ID2 = <FILE>;
$R1_qual = <FILE>;
chomp($R1_seq);
chomp($R1_qual);
$R1_seq = substr($R1_seq,0,$length);
$R1_qual = substr($R1_qual,0,$length);
if ($R1_seq =~ m/N/) {
} else {
@ASCII = unpack("C*", $R1_qual);
$prob = 100;
$x = 1;
$R1_length = length($R1_qual);
foreach $value (@ASCII) {
$value = $value - $phred;
$prob = $prob * (1-(10**(-$value/10)));
$x++;
}
if ($prob >= $percent_filter) {
$good_reads++;
print $R1_ID1;
print "$R1_seq\n";
print $R1_ID2;
print "$R1_qual\n";
}
}
}
close FILE;
perl HashSeqs.pl Dpup_007_RA_QF.fastq Dpup_007 > Dpup_007_RA_QF.hash
HashSeqs.pl
#!/usr/bin/perl
$file = $ARGV[0];
$lib = $ARGV[1];
$min = 2;
open(FILE, "<$file")
or die;
while (<FILE>) {
$seq_line = <FILE>;
chomp($seq_line);
$tags{$seq_line}++;
$_ = <FILE>;
$_ = <FILE>;
}
close FILE;
$z = 1;
foreach $key (sort { $tags{$b} <=> $tags{$a} } keys %tags) {
if ($tags{$key} >= $min) {
print ">$lib;$z;$tags{$key}\n";
print "$key\n";
}
$z++;
}
perl PrintHashHisto.pl Dpup_007_RA_QF.hash
Before aligning, need to know parameters for alignment script. To count files in a dir:
ls -l *.hash | wc -l
Concatenate all the hash files cat *.hash > _.fasta
where the underscore can be anything you want (like a name).
Create the novoindex index file (can run with 32G if need more memory):
screen
See notes at start of this pipeline in case you are unfamiliarsrun -t 48:00:00 --mem=16G novoindex _.fa.idx _.fasta
This is what you want it called and whatever your fasta file was namedsmap -c | grep your_user_name
Now align index to itself, with following:
screen
See notes at start of this pipeline in case you are unfamiliarsrun -t 48:00:00 --mem=16G novoalign -r E 48 -t 180 -d _.fa.idx -f _.fasta > _.novo
du -hs * | sort -hr
screen
See notes at start of this pipeline in case you are unfamiliarsrun -t 48:00:00 --mem=16G IdentifyLoci3.pl _.novo > _IDloci.fasta
IdentifyLoci3.pl
#!/usr/bin/perl
#######################################################################################
$max_alignment_score = 30;
$divergence_factor = 2;
$min_count = 0; #zero turns off to use min_flag instead
$min_flag = 0; #zero turns off to use min_count instead
$max_internal_alignments = 1;
$min_internal_alignments = 0;
$max_external_alignments = 10; # twice the df of the total number of samples (i.e. individuals) in the alignment
$min_external_alignments = 2; # approximately half of the df of the total number of samples (i.e. individuals) in the alignment
$max_total_alignments = 11; # max_ext_align + max_int_align
$min_total_alignments = 2; # min_ext_align + min_int_align
$min_alleles = 1;
$max_alleles = 4;
$min_samples_per_allele = 2;
$max_samples_per_allele = 6; # total number of samples (i.e. individuals) in the alignment
#######################################################################################
$file = $ARGV[0];
open(FILE, "<$file") or die;
while (<FILE>) {
$line = $_;
chomp($line);
if (substr($line,0,1) eq ">") {
@tabs = split(/\t/,$line);
if ($tabs[9] eq "F") {
$query = $tabs[0];
($query_lib, $query_ID, $query_count, $query_flag) = split(/\;/, $query);
$query_sequence = $tabs[2];
if ($query_used{$query} != 1 && $query_count >= $min_count && $query_flag >= $min_flag) {
$query_sequences{$query} = $query_sequence;
$sample_count{$query_sequence}++;
$query_used{$query} = 1;
}
}
}
}
close(FILE);
%query_used = (); #clear this hash to clear memory
open(FILE, "<$file") or die;
while (<FILE>) {
$line = $_;
chomp($line);
if (substr($line,0,1) eq ">") {
@tabs = split(/\t/,$line);
if ($tabs[9] eq "F") {
$query = $tabs[0];
($query_lib, $query_ID, $query_count, $query_flag) = split(/\;/, $query);
$index = $tabs[7];
($index_lib, $index_ID, $index_count, $index_flag) = split(/\;/, $index);
$score = $tabs[5];
$change = $tabs[13];
$query_sequence = $tabs[2];
$index_sequence = $query_sequences{$index};
if ( ($query ne $index) && ( $sample_count{$query_sequence} >= $min_samples_per_allele && $sample_count{$query_sequence} <= $max_samples_per_allele ) && ( $sample_count{$index_sequence} >= $min_samples_per_allele && $sample_count{$index_sequence} <= $max_samples_per_allele ) ) {
if ($query_lib eq $index_lib) {
push(@{$internal_alignments{$query}}, $index);
push(@{$internal_alignment_scores{$query}}, $score);
} else {
if ($score == 0) {
push(@{$perfect_external_alignments{$query}}, $index);
push(@{$external_alignments{$query}}, $index);
push(@{$external_alignment_scores{$query}}, $score);
} else {
push(@{$external_alignments{$query}}, $index);
push(@{$external_alignment_scores{$query}}, $score);
}
}
}
}
}
}
close(FILE);
$locus = 1;
foreach $query (keys %query_sequences) {
if ($printed{$query} != 1) {
$bad = 0;
@alleles = valid_alignments($query);
$string1 = "@alleles";
if ($string1 ne "") {
foreach (@alleles) {
@alleles2 = valid_alignments($_);
$string2 = "@alleles2";
if ($string1 ne $string2) { $bad = 1; }
}
if ($bad != 1) {
$print_string = ""; $print_count = 0;
foreach (@alleles) {
if ($printed{$_} != 1) {
@matches = ($_, @{$perfect_external_alignments{$_}});
@matches = sort(@matches);
$ID = "";
foreach (@matches) {
$ID = $ID . substr($_,1,length($_)) . "|";
$printed{$_} = 1;
}
chop($ID);
$plocus = sprintf("%06d", $locus); $plocus = "R" . $plocus;
$print_string = $print_string . ">$plocus\{$ID\}\n";
$print_string = $print_string . $query_sequences{$_} . "\n";
$print_count++;
}
}
if ($print_count >= $min_alleles && $print_count <= $max_alleles) {
print $print_string;
$locus++;
}
}
}
}
}
sub valid_alignments {
$seq = $_[0];
@int_aligns = ();
@ext_aligns = ();
@aligns = ();
$flag = 0;
($seq_lib, $seq_ID, $blank, $seq_count) = split(/\;/, $seq);
$x = 0;
while ($x < scalar(@{$internal_alignments{$seq}})) {
if ($internal_alignment_scores{$seq}[$x] <= $max_alignment_score) {
push(@int_aligns, $internal_alignments{$seq}[$x]);
} elsif ($internal_alignment_scores{$seq}[$x] < $max_alignment_score*$divergence_factor) {
$flag = 1;
}
$x++;
}
if (scalar(@int_aligns) > $max_internal_alignments
|| scalar(@int_aligns) < $min_internal_alignments) { $flag = 1; }
$x = 0;
while ($x < scalar(@{$external_alignments{$seq}})) {
if ($external_alignment_scores{$seq}[$x] <= $max_alignment_score) {
push(@ext_aligns, $external_alignments{$seq}[$x]);
} elsif ($external_alignment_scores{$seq}[$x] < $max_alignment_score*$divergence_factor) {
$flag = 1;
}
$x++;
}
if (scalar(@ext_aligns) > $max_external_alignments
|| scalar(@ext_aligns) < $min_external_alignments) { $flag = 1; }
if (scalar(@int_aligns)+scalar(@ext_aligns) > $max_total_alignments
|| scalar(@int_aligns)+scalar(@ext_aligns) < $min_total_alignments) { $flag = 1; }
if ($flag != 1) {
@aligns = ($seq, @int_aligns, @ext_aligns);
@aligns = sort(@aligns);
return @aligns;
}
}
perl SimplifyLoci2.pl _IDloci.fasta | grep "_2" | wc -l
mkdir PRICE [in denovo folder]
mkdir extendLoci_1
(if 104,000 or less (aa:az)) 4000 loci x 26 letters (a-z) equals 104,000mkdir extendLoci_2
(if >104,000 loci) add another directory for each 104000 you will need.cp ~PATH/RecoverLocusSpecificReads.pl ./
(This must be in the PRICE directory)getLoci.py
RecoverLocusSpecificReads.sh
extendLoci.sh
format_contigs.sh
cat.sh
select_loci.py
(This must be in the PRICE directory)cat *_RA.fastq > RABO_R1.fastq
For example RABO is the species name and its all individuals sequencedcat *_RB.fastq > RABO_R2.fastq
cp RABO* ./PRICE
????? / 2 =
?????? LOCI using IDLoci2
perl ~/scripts/SimplifyLoci2.pl _IDloci.fasta | grep --no-group-separator -A 1 "_1" > _IDloci_s.fasta
simplified final
versionsed 's/_1//' _IDloci_s.fasta > _IDloci_sf.fasta
split -l 8000 _IDloci_sf.fasta _IDloci_sf.fasta'_'
This should give you the same file name plus aa,ab,ac,etc at the end of the file name_IDloci_sf.fasta_aa
, _IDloci_sf.fasta_ab
)
ls *.fasta_a* > data_list
tail _IDloci_sf.fasta_a?
and the last number is the number of loci you have (R??????
) OR
Loci_aa
file which is a list of R000001 to whatever locus number you have from #8 (up to R104000) and (if necessary) a Loci_bb
which is R104001 to whatever locus number you have. You can use the Rscript (makelist.R
) or use a text editor.
makelist.R
numbs<-seq(1:108279)
Again this is assuming you have 108279 loci rabo<-sprintf("R%06d", numbs)
write.table(rabo, quote=FALSE, col.names = FALSE, row.names = F)
Rscript makelist.R
Should provide Loci_aa and Loci_bb filesextendLoci_*
directory you created
RecoverLocusSpecificReads.sh
extendLoci.sh
format_contigs.sh
cat.sh
getLoci.py
RecoverLocusSpecificReads.sh
to reflect your ../data_list and two fastq files (../_R1.fastq and ../_R2.fastq). Also adjust $x -le ?
to reflect the number of files in your data_list (so if you have aa->ag, ? = 7).
extendLoci.sh
and adjust as needed:
extendLoci_1.sh
format_contigs.sh
and adjust as needed:
format_contigs.sh
Run scripts in each extendLoci directory IN ORDER from within that directory:
RecoverLocusSpecificReads.sh
in extend_Loci_?
directory
sbatch RecoverLocusSpecificReads.sh with Loci list (data_list)
to create sh files in PRICEsbatch extendLoci.sh ../Loci_aa
sbatch format_contigs.sh ../Loci_aa
sbatch cat.sh aa
mv aa_contigs.fasta ../
Be careful not to “ls” the extendLoci_*
directories as they are very full and may stall your command line
Cat Loci_aa and Loci_bb together, if necessary: cat Loci_??_contigs.fasta > final_contigs.fasta
Then select loci with 300 bp or more: python select_loci.py final_contigs.fasta 300
wc -l final_contigs