Difference between revisions of "Sequence Analysis Practice 2011/03/09"
(14 intermediate revisions by one other user not shown) | |||
Line 3: | Line 3: | ||
Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks. | Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks. | ||
− | == Steps == | + | == Steps (Wednesday) == |
+ | |||
+ | [[Media:Hmkang-20110309-practical.pdf | Low-level Processing - Practical Slides (PDF)]] | ||
0. SETTING UP ENVIRONMENTAL VARIABLES | 0. SETTING UP ENVIRONMENTAL VARIABLES | ||
Line 16: | Line 18: | ||
1. Understanding FASTQ format | 1. Understanding FASTQ format | ||
− | zcat ${IN}/NA12878.exon.sample.read1.fastq.gz | head | + | zcat ${IN}/NA12878.exon.sample.read1.fastq.gz | head |
− | zcat ${IN}/NA12878.exon.sample.read2.fastq.gz | head | + | zcat ${IN}/NA12878.exon.sample.read2.fastq.gz | head |
− | zcat ${IN}/NA12878.exon.sample.unpaired.fastq.gz | head | + | zcat ${IN}/NA12878.exon.sample.unpaired.fastq.gz | head |
press q to quit | press q to quit | ||
Line 32: | Line 34: | ||
${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai ${IN}/NA12878.exon.sample.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted | ${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai ${IN}/NA12878.exon.sample.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted | ||
− | ${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon.sample.read1.fastq.gz.sai ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai ${IN}/NA12878.exon.sample.read1.fastq.gz ${IN}/NA12878.exon.sample.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.paired.bwa.sorted | + | ${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon.sample.read1.fastq.gz.sai ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai ${IN}/NA12878.exon.sample.read1.fastq.gz ${IN}/NA12878.exon.sample.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.paired.bwa.sorted |
3. Merge multiple BAMs into one | 3. Merge multiple BAMs into one | ||
Line 39: | Line 41: | ||
4. View SAM/BAM format | 4. View SAM/BAM format | ||
− | ${BIN}/samtools-hybrid view -h ${OUT}/NA12878.exon.sample.merged.bam | | + | ${BIN}/samtools-hybrid view -h ${OUT}/NA12878.exon.sample.merged.bam | less |
+ | |||
+ | 5. Mark Duplicate Reads | ||
+ | ${BIN}/superDeDuper -i ${OUT}/NA12878.exon.sample.merged.bam -o ${OUT}/NA12878.exon.sample.deduped.bam -v | ||
+ | |||
+ | 6. Visualize alignment to reference genome | ||
+ | ${BIN}/samtools-hybrid index ${OUT}/NA12878.exon.sample.deduped.bam | ||
+ | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa | ||
+ | |||
+ | * Type 'g', and 20:19989392 | ||
+ | * TYPE 'g', and 20:20032998 | ||
+ | |||
+ | 7. Use QPLOT to assess the quality | ||
+ | |||
+ | ${BIN}/qplot --plot ${OUT}/NA12878.exon.sample.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon.sample.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon.sample.deduped.bam | ||
+ | |||
+ | == Steps (Thursday) == | ||
+ | |||
+ | 0. SETTING UP ENVIRONMENTAL VARIABLES | ||
+ | |||
+ | setenv BIN /home/hyun/wed/bin | ||
+ | setenv IN /home/hyun/wed/input | ||
+ | setenv REF /home/hyun/wed/ref | ||
+ | |||
+ | setenv OUT ~/seq/wednesday/output | ||
+ | mkdir --p ${OUT} | ||
+ | |||
+ | 1. EXON-TARGETTED DATA : COMPUTING GENOTYPE LIKELHOOD FROM BAM FILES | ||
+ | |||
+ | ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.deduped.bam > ${OUT}/NA12878.exon.sample.glf | ||
+ | |||
+ | 2. EXON-TARGETTED DATA : VIEW THE GENOTYPE LIKELIHOOD FORMAT | ||
+ | |||
+ | ${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon.sample.glf | less | ||
+ | |||
+ | TYPE 'q' to finish | ||
+ | |||
+ | 3. EXON-TARGETTED DATA : SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE | ||
+ | |||
+ | ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon.sample.glf -b ${OUT}/NA12878.exon.sample.vcf | ||
+ | |||
+ | 4. EXON-TARGETTED DATA : VIEW THE VCF FILES AND COUNT # OF SNPS | ||
+ | |||
+ | less ${OUT}/NA12878.exon.sample.vcf | ||
+ | |||
+ | grep -v ^# ${OUT}/NA12878.exon.sample.vcf | wc -l | ||
+ | |||
+ | 5. DEEP-COVERAGE GENOME : COMPUTE THE GENOTYPE LIKELIHOOD | ||
+ | |||
+ | ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.highcov.sample.bam > ${OUT}/NA12878.highcov.sample.glf | ||
+ | |||
+ | 6. DEEP-COVERAGE GENOME : SINGLE-SAMPLE VARIANT CALLING | ||
+ | |||
+ | ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.highcov.sample.glf -b ${OUT}/NA12878.highcov.sample.vcf | ||
+ | |||
+ | 7. DEEP-COVERAGE GENOME : VIEW THE VCF FILES AND COUNT # OF SNPS | ||
+ | |||
+ | less ${OUT}/NA12878.highcov.sample.vcf | ||
+ | |||
+ | 8. VIEW THE VCF FILES AND COUNT # OF SNPS | ||
+ | grep -v ^# ${OUT}/NA12878.highcov.sample.vcf | wc -l | ||
− | + | 9. EVALUATE OVERLAP BETWEEN THE TWO SETS OF VARIANT CALLS | |
− | |||
− | |||
− | + | cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l | |
− | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam | + | |
− | + | cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | |
+ | |||
+ | 10. VIEW ACTUAL ALIGNMENT AT SNP POSITIONS | ||
+ | |||
+ | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa | ||
+ | |||
+ | TYPE g, and 20:19989392 | ||
+ | TYPE g, and 20:20032998 | ||
+ | TYPE g, and 20:20139952 | ||
+ | |||
+ | ${BIN}/samtools-hybrid tview ${IN}/NA12878.highcov.sample.bam ${REF}/human_g1k_v37_chr20.fa | ||
+ | |||
+ | TYPE g, and 20:19989392 | ||
+ | TYPE g, and 20:20032998 | ||
+ | TYPE g, and 20:20139952 | ||
+ | |||
+ | 11. SUMMARIZE VCF STATISTICS | ||
+ | |||
+ | perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20 | ||
+ | |||
+ | perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.highcov.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20 | ||
+ | |||
+ | == Filtering Examples in multisample calling == | ||
+ | * [[Media:Filtering example unfiltered.pdf | (BEFORE FILTERING)]] | ||
+ | * [[Media:Filtering example.pdf | (AFTER FILTERING)]] | ||
+ | |||
+ | |||
+ | == Where can you download this software? == | ||
+ | samtools-hybrid, glfSingle, superDeDuper, qplot can be downloaded at: | ||
+ | https://github.com/statgen/statgen |
Latest revision as of 13:29, 10 March 2011
Overview
Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks.
Steps (Wednesday)
Low-level Processing - Practical Slides (PDF)
0. SETTING UP ENVIRONMENTAL VARIABLES
setenv BIN /home/hyun/wed/bin setenv IN /home/hyun/wed/input setenv REF /home/hyun/wed/ref setenv OUT ~/seq/wednesday/output mkdir --p ${OUT}
1. Understanding FASTQ format
zcat ${IN}/NA12878.exon.sample.read1.fastq.gz | head zcat ${IN}/NA12878.exon.sample.read2.fastq.gz | head zcat ${IN}/NA12878.exon.sample.unpaired.fastq.gz | head
press q to quit
2. Align using BWA
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.read1.fastq.gz > ${OUT}/NA12878.exon.sample.read1.fastq.gz.sai ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.read2.fastq.gz > ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.unpaired.fastq.gz > ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai
${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai ${IN}/NA12878.exon.sample.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted ${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon.sample.read1.fastq.gz.sai ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai ${IN}/NA12878.exon.sample.read1.fastq.gz ${IN}/NA12878.exon.sample.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.paired.bwa.sorted
3. Merge multiple BAMs into one
${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon.sample.merged.bam ${OUT}/NA12878.exon.sample.paired.bwa.sorted.bam ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted.bam
4. View SAM/BAM format
${BIN}/samtools-hybrid view -h ${OUT}/NA12878.exon.sample.merged.bam | less
5. Mark Duplicate Reads
${BIN}/superDeDuper -i ${OUT}/NA12878.exon.sample.merged.bam -o ${OUT}/NA12878.exon.sample.deduped.bam -v
6. Visualize alignment to reference genome
${BIN}/samtools-hybrid index ${OUT}/NA12878.exon.sample.deduped.bam ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa
- Type 'g', and 20:19989392
- TYPE 'g', and 20:20032998
7. Use QPLOT to assess the quality
${BIN}/qplot --plot ${OUT}/NA12878.exon.sample.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon.sample.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon.sample.deduped.bam
Steps (Thursday)
0. SETTING UP ENVIRONMENTAL VARIABLES
setenv BIN /home/hyun/wed/bin setenv IN /home/hyun/wed/input setenv REF /home/hyun/wed/ref setenv OUT ~/seq/wednesday/output mkdir --p ${OUT}
1. EXON-TARGETTED DATA : COMPUTING GENOTYPE LIKELHOOD FROM BAM FILES
${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.deduped.bam > ${OUT}/NA12878.exon.sample.glf
2. EXON-TARGETTED DATA : VIEW THE GENOTYPE LIKELIHOOD FORMAT
${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon.sample.glf | less
TYPE 'q' to finish
3. EXON-TARGETTED DATA : SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE
${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon.sample.glf -b ${OUT}/NA12878.exon.sample.vcf
4. EXON-TARGETTED DATA : VIEW THE VCF FILES AND COUNT # OF SNPS
less ${OUT}/NA12878.exon.sample.vcf
grep -v ^# ${OUT}/NA12878.exon.sample.vcf | wc -l
5. DEEP-COVERAGE GENOME : COMPUTE THE GENOTYPE LIKELIHOOD
${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.highcov.sample.bam > ${OUT}/NA12878.highcov.sample.glf
6. DEEP-COVERAGE GENOME : SINGLE-SAMPLE VARIANT CALLING
${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.highcov.sample.glf -b ${OUT}/NA12878.highcov.sample.vcf
7. DEEP-COVERAGE GENOME : VIEW THE VCF FILES AND COUNT # OF SNPS
less ${OUT}/NA12878.highcov.sample.vcf
8. VIEW THE VCF FILES AND COUNT # OF SNPS
grep -v ^# ${OUT}/NA12878.highcov.sample.vcf | wc -l
9. EVALUATE OVERLAP BETWEEN THE TWO SETS OF VARIANT CALLS
cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d
10. VIEW ACTUAL ALIGNMENT AT SNP POSITIONS
${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa
TYPE g, and 20:19989392 TYPE g, and 20:20032998 TYPE g, and 20:20139952
${BIN}/samtools-hybrid tview ${IN}/NA12878.highcov.sample.bam ${REF}/human_g1k_v37_chr20.fa
TYPE g, and 20:19989392 TYPE g, and 20:20032998 TYPE g, and 20:20139952
11. SUMMARIZE VCF STATISTICS
perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20 perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.highcov.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
Filtering Examples in multisample calling
Where can you download this software?
samtools-hybrid, glfSingle, superDeDuper, qplot can be downloaded at: https://github.com/statgen/statgen