Difference between revisions of "Sequence Analysis Practice 2011/03/09"

From Genome Analysis Wiki
Jump to navigationJump to search
 
(23 intermediate revisions by one other user not shown)
Line 3: Line 3:
 
Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks.
 
Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks.
  
== Steps ==
+
== Steps (Wednesday) ==
 +
 
 +
[[Media:Hmkang-20110309-practical.pdf | Low-level Processing - Practical Slides (PDF)]]
  
 
0. SETTING UP ENVIRONMENTAL VARIABLES
 
0. SETTING UP ENVIRONMENTAL VARIABLES
Line 11: Line 13:
 
  setenv REF /home/hyun/wed/ref
 
  setenv REF /home/hyun/wed/ref
 
   
 
   
  setenv OUT ~/seq/wednesday/output
+
  setenv OUT ~/seq/wednesday/output  
 
  mkdir --p ${OUT}
 
  mkdir --p ${OUT}
  
or run
+
1. Understanding FASTQ format
 +
 
 +
zcat ${IN}/NA12878.exon.sample.read1.fastq.gz | head
 +
zcat ${IN}/NA12878.exon.sample.read2.fastq.gz | head
 +
zcat ${IN}/NA12878.exon.sample.unpaired.fastq.gz | head
  
csh /home/hyun/wed/bin/sh/step0.sh
+
press q to quit
  
1. Align using BWA
+
2. Align using BWA
  
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai
+
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.read1.fastq.gz > ${OUT}/NA12878.exon.sample.read1.fastq.gz.sai
 
   
 
   
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai
+
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.read2.fastq.gz > ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai
 
+
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai
+
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.unpaired.fastq.gz > ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai
 
 
${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted
 
 
 
${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted
 
 
 
2. MERGE ALIGNED BAMS INTO A SINGLE BAM
 
  
  ${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted.bam
+
  ${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai ${IN}/NA12878.exon.sample.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted
 +
 +
${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa  ${OUT}//NA12878.exon.sample.read1.fastq.gz.sai ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai  ${IN}/NA12878.exon.sample.read1.fastq.gz ${IN}/NA12878.exon.sample.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.paired.bwa.sorted
  
3. BRIEF SUMMARY OF THE BAM
+
3. Merge multiple BAMs into one
 +
${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon.sample.merged.bam ${OUT}/NA12878.exon.sample.paired.bwa.sorted.bam  ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted.bam
  
${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam
 
  
4. MARK DUPLICATED READS
+
4. View SAM/BAM format
 +
${BIN}/samtools-hybrid view -h ${OUT}/NA12878.exon.sample.merged.bam | less
  
  ${BIN}//superDeDuper -i ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam -o ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam -v
+
5. Mark Duplicate Reads
 +
  ${BIN}/superDeDuper -i ${OUT}/NA12878.exon.sample.merged.bam -o ${OUT}/NA12878.exon.sample.deduped.bam -v
  
5. BRIEF SUMMARY OF THE BAM
+
6. Visualize alignment to reference genome
 +
${BIN}/samtools-hybrid index ${OUT}/NA12878.exon.sample.deduped.bam
 +
${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa
  
${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
+
* Type 'g', and 20:19989392
 +
* TYPE 'g', and 20:20032998
  
6. VIEW THE ALIGNMENT
+
7. Use QPLOT to assess the quality
  
  ${BIN}/samtools-hybrid view  ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | less
+
  ${BIN}/qplot --plot ${OUT}/NA12878.exon.sample.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon.sample.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent  ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon.sample.deduped.bam
  
TYPE 'q' to finish
+
== Steps (Thursday) ==
  
7. INDEX THE ALIGNMENT FOR RANDOM ACCESS OF THE BAM
+
0. SETTING UP ENVIRONMENTAL VARIABLES
  
  ${BIN}/samtools-hybrid index  ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
+
  setenv BIN /home/hyun/wed/bin
 +
setenv IN /home/hyun/wed/input
 +
setenv REF /home/hyun/wed/ref
 +
 +
setenv OUT ~/seq/wednesday/output
 +
mkdir --p ${OUT}
  
8. GENOMIC VIEW OF THE ALIGNMENT
+
1. EXON-TARGETTED DATA : COMPUTING GENOTYPE LIKELHOOD FROM BAM FILES
  
  ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa
+
  ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.deduped.bam > ${OUT}/NA12878.exon.sample.glf
  
TYPE 'g' and 20:20000000 (7 consecutive zeros)
+
2. EXON-TARGETTED DATA : VIEW THE GENOTYPE LIKELIHOOD FORMAT
 
 
 
 
9. QUALITY CHECKING USING QPLOT
 
 
 
${BIN}/qplot --plot ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent  ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
 
 
   
 
   
  cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats
+
  ${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon.sample.glf | less
  
The PDF file can be viewed  ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf
+
TYPE 'q' to finish
  
9. COMPUTE THE GENOTYPE LIKELIHOOD
+
3. EXON-TARGETTED DATA : SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE
  
  ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf
+
  ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon.sample.glf -b ${OUT}/NA12878.exon.sample.vcf
 
${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf | less
 
  
TYPE 'q' to finish
+
4. EXON-TARGETTED DATA : VIEW THE VCF FILES AND COUNT # OF SNPS
  
10. SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE
+
less ${OUT}/NA12878.exon.sample.vcf
  
  ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf
+
  grep -v ^# ${OUT}/NA12878.exon.sample.vcf | wc -l
 
less ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf
 
  
TYPE 'q' to finish
+
5. DEEP-COVERAGE GENOME : COMPUTE THE GENOTYPE LIKELIHOOD
  
11. COMPUTE THE GENOTYPE LIKELIHOOD OF HIGH-COVERAGE DATA
+
${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.highcov.sample.bam > ${OUT}/NA12878.highcov.sample.glf
  
${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam > ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf
+
6. DEEP-COVERAGE GENOME : SINGLE-SAMPLE VARIANT CALLING
  
12. SINGLE-SAMPLE GENOTYPE CALLING on the HIGH COVERAGE DATA
+
${BIN}/glfSingle --maxDepth 10000  --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.highcov.sample.glf -b ${OUT}/NA12878.highcov.sample.vcf
  
${BIN}/glfSingle --maxDepth 10000  --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf
+
7. DEEP-COVERAGE GENOME : VIEW THE VCF FILES AND COUNT # OF SNPS
 
   
 
   
  less ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf
+
  less ${OUT}/NA12878.highcov.sample.vcf
 +
 
 +
8. VIEW THE VCF FILES AND COUNT # OF SNPS
  
TYPE 'q' to finish
+
grep -v ^# ${OUT}/NA12878.highcov.sample.vcf | wc -l
  
13. COUNT NUMBER OF SNPs and OVERLAPS
+
9. EVALUATE OVERLAP BETWEEN THE TWO SETS OF VARIANT CALLS
  
  grep -v ^# ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l  
+
  cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l
 
   
 
   
grep -v ^# ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l
+
  cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d  
 
+
 
  cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l
+
10. VIEW ACTUAL ALIGNMENT AT SNP POSITIONS
 
cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d
 
 
   
 
   
  ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa
+
  ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa
 
   
 
   
TYPE g , and  20:19989392
+
TYPE g, and  20:19989392
 
TYPE g, and  20:20032998
 
TYPE g, and  20:20032998
 +
TYPE g, and  20:20139952
  
  ${BIN}/samtools-hybrid tview ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam ${REF}/human_g1k_v37_chr20.fa
+
  ${BIN}/samtools-hybrid tview ${IN}/NA12878.highcov.sample.bam ${REF}/human_g1k_v37_chr20.fa
  
TYPE g , and  20:19989392
+
TYPE g, and  20:19989392
 
TYPE g, and  20:20032998
 
TYPE g, and  20:20032998
 +
TYPE g, and  20:20139952
  
14. SUMMARIZE STATISTICS
+
11. SUMMARIZE VCF STATISTICS
  
  perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
+
  perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
 
   
 
   
  perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
+
  perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.highcov.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
 +
 
 +
== Filtering Examples in multisample calling ==
 +
* [[Media:Filtering example unfiltered.pdf | (BEFORE FILTERING)]]
 +
* [[Media:Filtering example.pdf | (AFTER FILTERING)]]
 +
 
 +
 
 +
== Where can you download this software? ==
 +
samtools-hybrid, glfSingle, superDeDuper, qplot can be downloaded at:
 +
https://github.com/statgen/statgen

Latest revision as of 13:29, 10 March 2011

Overview

Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks.

Steps (Wednesday)

Low-level Processing - Practical Slides (PDF)

0. SETTING UP ENVIRONMENTAL VARIABLES

setenv BIN /home/hyun/wed/bin
setenv IN /home/hyun/wed/input
setenv REF /home/hyun/wed/ref

setenv OUT ~/seq/wednesday/output 
mkdir --p ${OUT}

1. Understanding FASTQ format

zcat ${IN}/NA12878.exon.sample.read1.fastq.gz | head
zcat ${IN}/NA12878.exon.sample.read2.fastq.gz | head
zcat ${IN}/NA12878.exon.sample.unpaired.fastq.gz | head

press q to quit

2. Align using BWA

${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.read1.fastq.gz > ${OUT}/NA12878.exon.sample.read1.fastq.gz.sai

${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.read2.fastq.gz > ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai

${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.unpaired.fastq.gz > ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai
${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai ${IN}/NA12878.exon.sample.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted

${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa  ${OUT}//NA12878.exon.sample.read1.fastq.gz.sai ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai  ${IN}/NA12878.exon.sample.read1.fastq.gz ${IN}/NA12878.exon.sample.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.paired.bwa.sorted

3. Merge multiple BAMs into one

${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon.sample.merged.bam ${OUT}/NA12878.exon.sample.paired.bwa.sorted.bam  ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted.bam


4. View SAM/BAM format

${BIN}/samtools-hybrid view -h ${OUT}/NA12878.exon.sample.merged.bam | less

5. Mark Duplicate Reads

${BIN}/superDeDuper -i ${OUT}/NA12878.exon.sample.merged.bam -o ${OUT}/NA12878.exon.sample.deduped.bam -v

6. Visualize alignment to reference genome

${BIN}/samtools-hybrid index ${OUT}/NA12878.exon.sample.deduped.bam
${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa
  • Type 'g', and 20:19989392
  • TYPE 'g', and 20:20032998

7. Use QPLOT to assess the quality

${BIN}/qplot --plot ${OUT}/NA12878.exon.sample.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon.sample.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent  ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon.sample.deduped.bam

Steps (Thursday)

0. SETTING UP ENVIRONMENTAL VARIABLES

setenv BIN /home/hyun/wed/bin
setenv IN /home/hyun/wed/input
setenv REF /home/hyun/wed/ref

setenv OUT ~/seq/wednesday/output
mkdir --p ${OUT}

1. EXON-TARGETTED DATA : COMPUTING GENOTYPE LIKELHOOD FROM BAM FILES

${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.deduped.bam > ${OUT}/NA12878.exon.sample.glf

2. EXON-TARGETTED DATA : VIEW THE GENOTYPE LIKELIHOOD FORMAT

${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon.sample.glf | less

TYPE 'q' to finish

3. EXON-TARGETTED DATA : SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE

${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon.sample.glf -b ${OUT}/NA12878.exon.sample.vcf

4. EXON-TARGETTED DATA : VIEW THE VCF FILES AND COUNT # OF SNPS

less ${OUT}/NA12878.exon.sample.vcf
grep -v ^# ${OUT}/NA12878.exon.sample.vcf | wc -l 

5. DEEP-COVERAGE GENOME : COMPUTE THE GENOTYPE LIKELIHOOD

${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.highcov.sample.bam > ${OUT}/NA12878.highcov.sample.glf

6. DEEP-COVERAGE GENOME : SINGLE-SAMPLE VARIANT CALLING

${BIN}/glfSingle --maxDepth 10000  --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.highcov.sample.glf -b ${OUT}/NA12878.highcov.sample.vcf

7. DEEP-COVERAGE GENOME : VIEW THE VCF FILES AND COUNT # OF SNPS

less ${OUT}/NA12878.highcov.sample.vcf

8. VIEW THE VCF FILES AND COUNT # OF SNPS

grep -v ^# ${OUT}/NA12878.highcov.sample.vcf | wc -l 

9. EVALUATE OVERLAP BETWEEN THE TWO SETS OF VARIANT CALLS

cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l

cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d 

10. VIEW ACTUAL ALIGNMENT AT SNP POSITIONS

${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa

TYPE g, and 20:19989392 TYPE g, and 20:20032998 TYPE g, and 20:20139952

${BIN}/samtools-hybrid tview ${IN}/NA12878.highcov.sample.bam ${REF}/human_g1k_v37_chr20.fa

TYPE g, and 20:19989392 TYPE g, and 20:20032998 TYPE g, and 20:20139952

11. SUMMARIZE VCF STATISTICS

perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20

perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.highcov.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20

Filtering Examples in multisample calling


Where can you download this software?

samtools-hybrid, glfSingle, superDeDuper, qplot can be downloaded at: https://github.com/statgen/statgen