Editing Sequence Analysis Practice 2011/03/10
Overview
The aim for today's practice is to perform variant calling from sequence alignment files
Steps
0. SETTING UP ENVIRONMENT VARIABLES
setenv BIN /home/hyun/thu/bin
setenv IN /home/hyun/thu/input
setenv REF /home/hyun/thu/ref
setenv OUT ~/seq/thursday/output
mkdir -p ${OUT}
1. EXON-TARGETED DATA : COMPUTING GENOTYPE LIKELIHOOD FROM BAM FILES
${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.deduped.bam > ${OUT}/NA12878.exon.sample.glf
2. EXON-TARGETED DATA : VIEW THE GENOTYPE LIKELIHOOD FORMAT
${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon.sample.glf | less
TYPE 'q' to finish
3. EXON-TARGETED DATA : SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE
${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon.sample.glf -b ${OUT}/NA12878.exon.sample.vcf
4. EXON-TARGETED DATA : VIEW THE VCF FILES AND COUNT # OF SNPS
less ${OUT}/NA12878.exon.sample.vcf
grep -v ^# ${OUT}/NA12878.exon.sample.vcf | wc -l
5. DEEP-COVERAGE GENOME : COMPUTE THE GENOTYPE LIKELIHOOD
${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.highcov.sample.bam > ${OUT}/NA12878.highcov.sample.glf
6. DEEP-COVERAGE GENOME : SINGLE-SAMPLE VARIANT CALLING
${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.highcov.sample.glf -b ${OUT}/NA12878.highcov.sample.vcf
7. DEEP-COVERAGE GENOME : VIEW THE VCF FILES AND COUNT # OF SNPS
less ${OUT}/NA12878.highcov.sample.vcf
8. DEEP-COVERAGE GENOME : COUNT # OF SNPS
grep -v ^# ${OUT}/NA12878.highcov.sample.vcf | wc -l
9. EVALUATE OVERLAP BETWEEN THE TWO SETS OF VARIANT CALLS
cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l
cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d
10. VIEW ACTUAL ALIGNMENT AT SNP POSITIONS
${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa
TYPE g, and 20:19989392
TYPE g, and 20:20032998
TYPE g, and 20:20139952
${BIN}/samtools-hybrid tview ${IN}/NA12878.highcov.sample.bam ${REF}/human_g1k_v37_chr20.fa
TYPE g, and 20:19989392
TYPE g, and 20:20032998
TYPE g, and 20:20139952
11. SUMMARIZE VCF STATISTICS
perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.highcov.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20