Difference between revisions of "Sequence Analysis Practice 2011/03/09"
(→Steps) |
(→Steps) |
||
Line 5: | Line 5: | ||
== Steps == | == Steps == | ||
− | + | 0. SETTING UP ENVIRONMENTAL VARIABLES | |
setenv BIN /home/hyun/wed/bin2 | setenv BIN /home/hyun/wed/bin2 | ||
Line 14: | Line 14: | ||
mkdir --p ${OUT} | mkdir --p ${OUT} | ||
− | + | 1. Align using BWA | |
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai | ||
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai | ||
− | + | ||
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai | ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai | ||
− | + | ||
${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted | ${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted | ||
− | + | ||
${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted | ${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted | ||
− | + | ||
− | + | 2. MERGE ALIGNED BAMS INTO A SINGLE BAM | |
− | ${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted.bam | + | ${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted.bam |
3. BRIEF SUMMARY OF THE BAM | 3. BRIEF SUMMARY OF THE BAM | ||
− | ${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam | + | ${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam |
4. MARK DUPLICATED READS | 4. MARK DUPLICATED READS | ||
− | ${BIN}//superDeDuper -i ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam -o ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam -v | + | ${BIN}//superDeDuper -i ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam -o ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam -v |
5. BRIEF SUMMARY OF THE BAM | 5. BRIEF SUMMARY OF THE BAM | ||
− | ${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | + | ${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam |
6. VIEW THE ALIGNMENT | 6. VIEW THE ALIGNMENT | ||
− | ${BIN}/samtools-hybrid view ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | less | + | ${BIN}/samtools-hybrid view ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | less |
7. INDEX THE ALIGNMENT FOR RANDOM ACCESS OF THE BAM | 7. INDEX THE ALIGNMENT FOR RANDOM ACCESS OF THE BAM | ||
− | ${BIN}/samtools-hybrid index ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | + | ${BIN}/samtools-hybrid index ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam |
8. GENOMIC VIEW OF THE ALIGNMENT | 8. GENOMIC VIEW OF THE ALIGNMENT | ||
− | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa | + | ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa |
9. QUALITY CHECKING USING QPLOT | 9. QUALITY CHECKING USING QPLOT | ||
− | ${BIN}/qplot --plot ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | + | ${BIN}/qplot --plot ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam |
− | + | ||
− | cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats | + | cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats |
− | + | The PDF file can be viewed ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf | |
9. COMPUTE THE GENOTYPE LIKELIHOOD | 9. COMPUTE THE GENOTYPE LIKELIHOOD |
Revision as of 13:43, 9 March 2011
Overview
Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks.
Steps
0. SETTING UP ENVIRONMENTAL VARIABLES
setenv BIN /home/hyun/wed/bin2 setenv IN /home/hyun/wed/input2 setenv REF /home/hyun/wed/ref2
setenv OUT ~/seq/wednesday/output mkdir --p ${OUT}
1. Align using BWA
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai ${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted ${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted
2. MERGE ALIGNED BAMS INTO A SINGLE BAM
${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted.bam
3. BRIEF SUMMARY OF THE BAM
${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam
4. MARK DUPLICATED READS
${BIN}//superDeDuper -i ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam -o ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam -v
5. BRIEF SUMMARY OF THE BAM
${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
6. VIEW THE ALIGNMENT
${BIN}/samtools-hybrid view ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | less
7. INDEX THE ALIGNMENT FOR RANDOM ACCESS OF THE BAM
${BIN}/samtools-hybrid index ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
8. GENOMIC VIEW OF THE ALIGNMENT
${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa
9. QUALITY CHECKING USING QPLOT
${BIN}/qplot --plot ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats
The PDF file can be viewed ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf
9. COMPUTE THE GENOTYPE LIKELIHOOD
${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf
${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf | less
10. SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE
${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf
less ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf
11. COMPUTE THE GENOTYPE LIKELIHOOD OF HIGH-COVERAGE DATA
${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam > ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf
12. SINGLE-SAMPLE GENOTYPE CALLING on the HIGH COVERAGE DATA
${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf
less ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf
13. COUNT NUMBER OF SNPs and OVERLAPS
grep -v ^# ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l
grep -v ^# ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l
cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l
cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d
${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa
type g , and 20:19989392 type g, and 20:20032998
${BIN}/samtools-hybrid tview ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam ${REF}/human_g1k_v37_chr20.fa
type g , and 20:19989392 type g, and 20:20032998
14. SUMMARIZE STATISTICS
perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20