OS: UNIX/LINIX
SOFTWARE PACKAGES: git, wget, tar, unzip, autoconf, gcc(v4.8.5+), java(v1.8.0+), perl, python
SOFTWARE PACKAGES: bwa, samtools, bedtools, fastp, samblaster, bcftools, htslib, vcftools, st
JAVA JARS: gatk, mutserve, haplogrep, haplocheck
HUMAN ASSEMBLY: hs38DH
$ git clone https://github.com/dpuiu/HP.git
$ PWD=`pwd`
$ export HP_SDIR=path_to_pipeline_home_directory/HP/scripts/ # set script directory variable
$ sudo $HP_SDIR/install_sysprerequisites.sh
$ $HP_SDIR/install_prerequisites.sh
# if successfull => "Success message!"
$ $HP_SDIR/checkInstall.sh
$ cat checkInstall.log
# go to the working directory ; copy init.sh
$ cp -i $HP_SDIR/init.sh .
# set variables (in terminal or init.sh)
$ nano init.sh
...
export HP_ADIR=$PWD/bams/ # alignment dir
export HP_ODIR=$PWD/out/ ; # output dir
export HP_IN=$PWD/in.txt # input file
$ . ./init.sh
$ mkdir -p $HP_ODIR
$ find $HP_ADIR/ -type f -name "*.bam" -o -name "*.cram" | \
$HP_SDIR/ls2in.pl -out $HP_ODIR | sort > $HP_IN
$ cut -f2 $HP_IN | sed "s|^|$HP_SDIR/samtools.sh |" > samtools.all.sh
$ . ./samtools.all.sh
$ cut -f2 $HP_IN | sed -r 's|(.*)\.|\1\t|g' | cut -f1 | sed 's|$|.count|' | xargs cat | \
$HP_SDIR/uniq.pl | $HP_SDIR/getCN.pl > $HP_ODIR/count.tab
$ $HP_SDIR/run.sh > filter.all.sh
$ . ./filter.all.sh
under $ODIR:
VCF/TAB/SUMMARY/FASTA Files:
{mutect2,mutserve}.{03,05,10}.{concat,merge[.sitesOnly]}.vcf # SNVs; 3,5,10% heteroplasmy thold
{mutect2,mutserve}.{03,05,10}.tab # SNV counts
{mutect2,mutserve}.{03,05,10}.summary # SNV count summaries
{mutect2,mutserve}.fa # consensus sequence
{mutect2,mutserve}.haplogroup[1].tab # haplogroup
{mutect2,mutserve}.haplocheck.tab # contamination screen
count.tab # reads & mtDNA-CN counts
cvg.tab # coverage stats
mutect2.mutect2.{03,05,10}.{concat,merge[.sitesOnly]}.vcf
mutect2.mutect2.{03,05,10}.tab
removed the NUMT labeled SNV's
$ $HP_SDIR/run.sh
$ head $HP_IN
#sampleName inputFile outputPath/prefix
chrM.A bams/chrM.A.bam out/chrM.A/chrM.A
chrM.B bams/chrM.B.bam out/chrM.B/chrM.B
chrM.C bams/chrM.C.bam out/chrM.C/chrM.C
...
$ head $HP_ODIR/count.tab
Run all mapped chrM M
chrM.A 851537886 848029490 396766 181.7
chrM.B 884383716 882213718 506597 223.01
chrM.C 786560467 785208588 503241 248.9
...
$ ls $HP_ADIR/*
bams/chrM.A.bam
bams/chrM.A.bam.bai
bams/chrM.A.idxstats
bams/chrM.A.count
...
$ ls out/
chrM.A/
chrM.B/
chrM.C/
...
$ cat mutect2.03.concat.vcf
...
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
chrM 64 . C T . clustered_events;haplotype SM=chrM.A;HV GT:DP:AF 0|1:67:1
chrM 73 . A G . PASS SM=chrM.B;HV;NUMT GT:DP:AF 0/1:52:1
chrM 73 . A G . PASS SM=chrM.C;HV GT:DP:AF 0/1:43:1
...
$ cat mutect2.03.merge.vcf
...
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT chrM.A chrM.B chrM.C
chrM 64 . C T . . AC=1;AN=2;HV GT:DP:AF 0|1:67:1 .:.:. .:.:.
chrM 73 . A G . . AC=1;AN=2;HV;NUMT GT:DP:AF .:.:. 0/1:52:1 .:.:.
chrM 73 . A G . . AC=2;AN=4;HV GT:DP:AF 0|1:70:1 .:.:. 0/1:43:1
...
$ cat mutect2.03.merge.sitesOnly.vcf
#CHROM POS ID REF ALT QUAL FILTER INFO
chrM 64 . C T . . AC=1;AN=2;HV
chrM 73 . A G . . AC=1;AN=2;HV;NUMT
chrM 73 . A G . . AC=2;AN=4;HV
....
$ cat mutect2.03.tab
Run H h S s I i Hp hp Sp sp Ip ip
chrM.A 31 44 28 36 3 8 3 0 0 0 3 0
chrM.B 27 42 25 34 2 8 5 0 3 0 2 0
chrM.C 39 43 35 35 4 8 2 0 0 0 2 0
$ cat mutect2.haplogroup.tab
Run haplogroup
chrM.A A2+(64)
chrM.B B2
chrM.C C
Run # Sample name
all # number of reads in the sample reads
mapped # number of aligned reads in the sample
chrM # number of reads aligned to chrM
filter # number of chrM used (subsample ~2000x cvg based on name)
Gcvg # recomputed genome coverage: Bases/3217346917
Mcvg # mitochondrion covearge: chrM*rdLen/16569
M # mtDNA copy number ; = 2*Mcvg/Gcvg
haplogroup # sample haplogroup
S # homozygous SNPs, 3% heteroplasmy rate
s # heterozygous SNPs
I # homozygous INDELs
i # heterozygous INDELs
...
?p # "p" suffix stands for non homopolimeric
$ nano init.sh
export HP_FNAME="no_qual_haplotype_strand"
export HP_FRULE="egrep -v \"qual|haplotype|strand\""
# rerun run.sh ...
$ HP_M=mutect2
$ cat $HP_ODIR/$M.00.concat.vcf | egrep -v 'qual|haplotype|strand' > $HP_ODIR/$M.no_qual_haplotype_strand.00.concat.vcf
$ $HP_SDIR/snpCount.sh $HP_IN $ODIR $HP_M.no_qual_haplotype_strand 03
$ $HP_SDIR/snpCount.sh $HP_IN $ODIR $HP_M.no_qual_haplotype_strand 05
$ $HP_SDIR/snpCount.sh $HP_IN $ODIR $HP_M.no_qual_haplotype_strand 10