Sequence Analysis Practice 2011/03/09

From Genome Analysis Wiki
Revision as of 14:47, 9 March 2011 by Hmkang (talk | contribs) (→‎Steps)
Jump to navigationJump to search

Overview

Below lists a sequence of practice mapping fastq files to bam files, performing variant calling and variout quality checks.

Steps

0. SETTING UP ENVIRONMENTAL VARIABLES

setenv BIN /home/hyun/wed/bin
setenv IN /home/hyun/wed/input
setenv REF /home/hyun/wed/ref

setenv OUT ~/seq/wednesday/output
mkdir --p ${OUT}

1. Align using BWA

${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai

${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai
 
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai
 
${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted
 
${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted

or run

csh /home/hyun/wed/bin/sh/step1.sh 


2. MERGE ALIGNED BAMS INTO A SINGLE BAM

${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted.bam

3. BRIEF SUMMARY OF THE BAM

${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam

4. MARK DUPLICATED READS

${BIN}//superDeDuper -i ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam -o ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam -v

5. BRIEF SUMMARY OF THE BAM

${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam

6. VIEW THE ALIGNMENT

${BIN}/samtools-hybrid view  ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | less

TYPE 'q' to finish

7. INDEX THE ALIGNMENT FOR RANDOM ACCESS OF THE BAM

${BIN}/samtools-hybrid index  ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam

8. GENOMIC VIEW OF THE ALIGNMENT

${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa

TYPE 'g' and 20:20000000 (7 consecutive zeros)


9. QUALITY CHECKING USING QPLOT

${BIN}/qplot --plot ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent  ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam

cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats

The PDF file can be viewed ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf

9. COMPUTE THE GENOTYPE LIKELIHOOD

${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf

${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf | less

TYPE 'q' to finish

10. SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE

${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf

less ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf

TYPE 'q' to finish

11. COMPUTE THE GENOTYPE LIKELIHOOD OF HIGH-COVERAGE DATA

${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam > ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf

12. SINGLE-SAMPLE GENOTYPE CALLING on the HIGH COVERAGE DATA

${BIN}/glfSingle --maxDepth 10000  --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf

less ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf

TYPE 'q' to finish

13. COUNT NUMBER OF SNPs and OVERLAPS

grep -v ^# ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l 

grep -v ^# ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l 
 
cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l

cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d 

${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa

TYPE g , and 20:19989392 TYPE g, and 20:20032998

${BIN}/samtools-hybrid tview ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam ${REF}/human_g1k_v37_chr20.fa

TYPE g , and 20:19989392 TYPE g, and 20:20032998

14. SUMMARIZE STATISTICS

perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20

perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20