Line 3: |
Line 3: |
| Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks. | | Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks. |
| | | |
− | == Steps == | + | == Steps (Wednesday) == |
| + | |
| + | [[Media:Hmkang-20110309-practical.pdf | Low-level Processing - Practical Slides (PDF)]] |
| | | |
| 0. SETTING UP ENVIRONMENTAL VARIABLES | | 0. SETTING UP ENVIRONMENTAL VARIABLES |
Line 11: |
Line 13: |
| setenv REF /home/hyun/wed/ref | | setenv REF /home/hyun/wed/ref |
| | | |
− | setenv OUT ~/seq/wednesday/output | + | setenv OUT ~/seq/wednesday/output |
| mkdir --p ${OUT} | | mkdir --p ${OUT} |
| | | |
− | 1. Align using BWA | + | 1. Understanding FASTQ format |
| + | |
| + | zcat ${IN}/NA12878.exon.sample.read1.fastq.gz | head |
| + | zcat ${IN}/NA12878.exon.sample.read2.fastq.gz | head |
| + | zcat ${IN}/NA12878.exon.sample.unpaired.fastq.gz | head |
| | | |
− | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai | + | press q to quit |
| + | |
| + | 2. Align using BWA |
| + | |
| + | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.read1.fastq.gz > ${OUT}/NA12878.exon.sample.read1.fastq.gz.sai |
| | | |
− | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai | + | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.read2.fastq.gz > ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai |
− |
| + | |
− | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai | + | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon.sample.unpaired.fastq.gz > ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai |
− |
| |
− | ${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted
| |
− |
| |
− | ${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted
| |
− |
| |
− | 2. MERGE ALIGNED BAMS INTO A SINGLE BAM
| |
| | | |
− | ${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted.bam | + | ${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai ${IN}/NA12878.exon.sample.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted |
| + | |
| + | ${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon.sample.read1.fastq.gz.sai ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai ${IN}/NA12878.exon.sample.read1.fastq.gz ${IN}/NA12878.exon.sample.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon.sample.paired.bwa.sorted |
| | | |
− | 3. BRIEF SUMMARY OF THE BAM | + | 3. Merge multiple BAMs into one |
| + | ${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon.sample.merged.bam ${OUT}/NA12878.exon.sample.paired.bwa.sorted.bam ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted.bam |
| | | |
− | ${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam
| |
| | | |
− | 4. MARK DUPLICATED READS | + | 4. View SAM/BAM format |
| + | ${BIN}/samtools-hybrid view -h ${OUT}/NA12878.exon.sample.merged.bam | less |
| | | |
− | ${BIN}//superDeDuper -i ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam -o ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam -v | + | 5. Mark Duplicate Reads |
| + | ${BIN}/superDeDuper -i ${OUT}/NA12878.exon.sample.merged.bam -o ${OUT}/NA12878.exon.sample.deduped.bam -v |
| | | |
− | 5. BRIEF SUMMARY OF THE BAM
| + | 6. Visualize alignment to reference genome |
| + | ${BIN}/samtools-hybrid index ${OUT}/NA12878.exon.sample.deduped.bam |
| + | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa |
| | | |
− | ${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
| + | * Type 'g', and 20:19989392 |
| + | * TYPE 'g', and 20:20032998 |
| | | |
− | 6. VIEW THE ALIGNMENT
| + | 7. Use QPLOT to assess the quality |
| | | |
− | ${BIN}/samtools-hybrid view ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | less | + | ${BIN}/qplot --plot ${OUT}/NA12878.exon.sample.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon.sample.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon.sample.deduped.bam |
| | | |
− | 7. INDEX THE ALIGNMENT FOR RANDOM ACCESS OF THE BAM
| + | == Steps (Thursday) == |
| | | |
− | ${BIN}/samtools-hybrid index ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
| + | 0. SETTING UP ENVIRONMENTAL VARIABLES |
| | | |
− | 8. GENOMIC VIEW OF THE ALIGNMENT
| + | setenv BIN /home/hyun/wed/bin |
| + | setenv IN /home/hyun/wed/input |
| + | setenv REF /home/hyun/wed/ref |
| + | |
| + | setenv OUT ~/seq/wednesday/output |
| + | mkdir --p ${OUT} |
| | | |
− | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa
| + | 1. EXON-TARGETTED DATA : COMPUTING GENOTYPE LIKELHOOD FROM BAM FILES |
| | | |
− | TYPE 'g' and 20:20000000
| + | ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon.sample.deduped.bam > ${OUT}/NA12878.exon.sample.glf |
| | | |
| + | 2. EXON-TARGETTED DATA : VIEW THE GENOTYPE LIKELIHOOD FORMAT |
| + | |
| + | ${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon.sample.glf | less |
| | | |
− | 9. QUALITY CHECKING USING QPLOT
| + | TYPE 'q' to finish |
| | | |
− | ${BIN}/qplot --plot ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
| + | 3. EXON-TARGETTED DATA : SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE |
− |
| |
− | cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats
| |
| | | |
− | The PDF file can be viewed ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf
| + | ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon.sample.glf -b ${OUT}/NA12878.exon.sample.vcf |
| | | |
− | 9. COMPUTE THE GENOTYPE LIKELIHOOD
| + | 4. EXON-TARGETTED DATA : VIEW THE VCF FILES AND COUNT # OF SNPS |
| | | |
− | ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf | + | less ${OUT}/NA12878.exon.sample.vcf |
− |
| |
− | ${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf | less
| |
| | | |
− | 10. SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE
| + | grep -v ^# ${OUT}/NA12878.exon.sample.vcf | wc -l |
| | | |
− | ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf
| + | 5. DEEP-COVERAGE GENOME : COMPUTE THE GENOTYPE LIKELIHOOD |
− |
| |
− | less ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf
| |
| | | |
− | 11. COMPUTE THE GENOTYPE LIKELIHOOD OF HIGH-COVERAGE DATA
| + | ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.highcov.sample.bam > ${OUT}/NA12878.highcov.sample.glf |
| | | |
− | ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam > ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf
| + | 6. DEEP-COVERAGE GENOME : SINGLE-SAMPLE VARIANT CALLING |
| | | |
− | 12. SINGLE-SAMPLE GENOTYPE CALLING on the HIGH COVERAGE DATA
| + | ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.highcov.sample.glf -b ${OUT}/NA12878.highcov.sample.vcf |
| | | |
− | ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf
| + | 7. DEEP-COVERAGE GENOME : VIEW THE VCF FILES AND COUNT # OF SNPS |
| | | |
− | less ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | + | less ${OUT}/NA12878.highcov.sample.vcf |
| + | |
| + | 8. VIEW THE VCF FILES AND COUNT # OF SNPS |
| + | |
| + | grep -v ^# ${OUT}/NA12878.highcov.sample.vcf | wc -l |
| | | |
− | 13. COUNT NUMBER OF SNPs and OVERLAPS
| + | 9. EVALUATE OVERLAP BETWEEN THE TWO SETS OF VARIANT CALLS |
| | | |
− | grep -v ^# ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l | + | cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l |
| | | |
− | grep -v ^# ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l
| + | cat ${OUT}/NA12878.exon.sample.vcf ${OUT}/NA12878.highcov.sample.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d |
− |
| + | |
− | cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l | + | 10. VIEW ACTUAL ALIGNMENT AT SNP POSITIONS |
− |
| |
− | cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d
| |
| | | |
− | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa | + | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam ${REF}/human_g1k_v37_chr20.fa |
| | | |
− | TYPE g , and 20:19989392 | + | TYPE g, and 20:19989392 |
| TYPE g, and 20:20032998 | | TYPE g, and 20:20032998 |
| + | TYPE g, and 20:20139952 |
| | | |
− | ${BIN}/samtools-hybrid tview ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam ${REF}/human_g1k_v37_chr20.fa | + | ${BIN}/samtools-hybrid tview ${IN}/NA12878.highcov.sample.bam ${REF}/human_g1k_v37_chr20.fa |
| | | |
− | TYPE g , and 20:19989392 | + | TYPE g, and 20:19989392 |
| TYPE g, and 20:20032998 | | TYPE g, and 20:20032998 |
| + | TYPE g, and 20:20139952 |
| | | |
− | 14. SUMMARIZE STATISTICS
| + | 11. SUMMARIZE VCF STATISTICS |
| | | |
− | perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20 | + | perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20 |
| | | |
− | perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20 | + | perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.highcov.sample.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20 |
| + | |
| + | == Filtering Examples in multisample calling == |
| + | * [[Media:Filtering example unfiltered.pdf | (BEFORE FILTERING)]] |
| + | * [[Media:Filtering example.pdf | (AFTER FILTERING)]] |
| + | |
| + | |
| + | == Where can you download this software? == |
| + | samtools-hybrid, glfSingle, superDeDuper, qplot can be downloaded at: |
| + | https://github.com/statgen/statgen |