Changes

From Genome Analysis Wiki
Jump to navigationJump to search
no edit summary
Line 11: Line 11:  
  setenv REF /home/hyun/wed/ref
 
  setenv REF /home/hyun/wed/ref
 
   
 
   
  setenv OUT ~/seq/wednesday/output
+
  setenv OUT ~/seq/wednesday/output  
 
  mkdir --p ${OUT}
 
  mkdir --p ${OUT}
   −
1. Align using BWA
+
1. Understanding FASTQ format
   −
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai
+
zcat ${IN}/NA12878.exon.sample.read1.fastq.gz | head
+
zcat ${IN}/NA12878.exon.sample.read2.fastq.gz | head
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai
+
zcat ${IN}/NA12878.exon.sample.unpaited.fastq.gz | head
 
  −
${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai
  −
 
  −
${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted
  −
 
  −
${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa ${OUT}//NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz.sai ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz.sai ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read1.fastq.gz ${IN}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.read2.fastq.gz | ${BIN}/samtools-hybrid view -uhS - | ${BIN}/samtools-hybrid sort -m 10000000 - ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted
  −
 
  −
or run
  −
 
  −
sh ${OUT}/sh/step1.sh
  −
 
  −
2. MERGE ALIGNED BAMS INTO A SINGLE BAM
  −
 
  −
${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.paired.bwa.sorted.bam ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.unpaired.bwa.sorted.bam
  −
 
  −
3. BRIEF SUMMARY OF THE BAM
  −
 
  −
${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam
  −
 
  −
4. MARK DUPLICATED READS
  −
 
  −
${BIN}//superDeDuper -i ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.merged.bam -o ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam -v
  −
 
  −
5. BRIEF SUMMARY OF THE BAM
  −
 
  −
${BIN}/samtools-hybrid flagstat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
     −
6. VIEW THE ALIGNMENT
+
press q to quit
   −
${BIN}/samtools-hybrid view  ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam | less
+
2. Align using BWA
   −
TYPE 'q' to finish
+
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa \
 
+
  ${IN}/NA12878.exon.sample.read1.fastq.gz \
7. INDEX THE ALIGNMENT FOR RANDOM ACCESS OF THE BAM
+
  > ${OUT}/NA12878.exon.sample.read1.fastq.gz.sai
 
  −
  ${BIN}/samtools-hybrid index  ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
  −
 
  −
8. GENOMIC VIEW OF THE ALIGNMENT
  −
 
  −
${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa
  −
 
  −
TYPE 'g' and 20:20000000 (7 consecutive zeros)
  −
 
  −
 
  −
9. QUALITY CHECKING USING QPLOT
  −
 
  −
${BIN}/qplot --plot ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf --stats ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats --reference ${REF}/human_g1k_v37_chr20.fa --dbsnp ${REF}/dbsnp.b130.ncbi37.chr20.tbl --gccontent  ${REF}/ncbi37.chr20.gc ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam
   
   
 
   
cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.stats
+
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa \
 
+
  ${IN}/NA12878.exon.sample.read2.fastq.gz \
The PDF file can be viewed  ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam.qplot.pdf
+
  > ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai
 
  −
9. COMPUTE THE GENOTYPE LIKELIHOOD
  −
 
  −
  ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam > ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf
   
   
 
   
  ${BIN}/samtools-hybrid glfview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf | less
+
  ${BIN}/bwa aln -q 15 ${REF}/human_g1k_v37_chr20.fa \
 +
    ${IN}/NA12878.exon.sample.unpaired.fastq.gz \
 +
    > ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai
   −
TYPE 'q' to finish
+
  ${BIN}/bwa samse ${REF}/human_g1k_v37_chr20.fa \
 
+
  ${OUT}/NA12878.exon.sample.unpaired.fastq.gz.sai \
10. SINGLE-SAMPLE GENOTYPE CALLING using GLFSINGLE
+
  ${IN}/NA12878.exon.sample.unpaired.fastq.gz \
 
+
  | ${BIN}/samtools-hybrid view -uhS - \
  ${BIN}/glfSingle --maxDepth 10000 --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf
+
  | ${BIN}/samtools-hybrid sort -m 10000000 \
 +
  - ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted
 
   
 
   
  less ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf
+
  ${BIN}/bwa sampe ${REF}/human_g1k_v37_chr20.fa \
 +
  ${OUT}//NA12878.exon.sample.read1.fastq.gz.sai \
 +
  ${OUT}/NA12878.exon.sample.read2.fastq.gz.sai \
 +
  ${IN}/NA12878.exon.sample.read1.fastq.gz \
 +
  ${IN}/NA12878.exon.sample.read2.fastq.gz \
 +
  | ${BIN}/samtools-hybrid view -uhS - \
 +
  | ${BIN}/samtools-hybrid sort -m 10000000 \
 +
  - ${OUT}/NA12878.exon.sample.paired.bwa.sorted
   −
TYPE 'q' to finish
+
3. Merge multiple BAMs into one
 +
${BIN}/samtools-hybrid merge ${OUT}/NA12878.exon.sample.merged.bam \
 +
  ${OUT}/NA12878.exon.sample.paired.bwa.sorted.bam  \
 +
  ${OUT}/NA12878.exon.sample.unpaired.bwa.sorted.bam
   −
11. COMPUTE THE GENOTYPE LIKELIHOOD OF HIGH-COVERAGE DATA
     −
  ${BIN}/samtools-hybrid pileup -g -f ${REF}/human_g1k_v37_chr20.fa ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam > ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf
+
4. View SAM/BAM format
 +
  ${BIN}/samtools-hybrid view -h ${OUT}/NA12878.exon.sample.merged.bam | head -5
   −
12. SINGLE-SAMPLE GENOTYPE CALLING on the HIGH COVERAGE DATA
     −
  ${BIN}/glfSingle --maxDepth 10000  --minMapQuality 20 -p 0.9 -g ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.glf -b ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf
+
5. Mark Deuplicate Reads
+
  ${BIN}/superDeDuper -i ${OUT}/NA12878.exon.sample.merged.bam \
less ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf
+
  -o ${OUT}/NA12878.exon.sample.deduped.bam -v
 
  −
TYPE 'q' to finish
  −
 
  −
13. COUNT NUMBER OF SNPs and OVERLAPS
     −
grep -v ^# ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l
+
6. Visualize alignment to reference genome
+
  ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon.sample.deduped.bam \
grep -v ^# ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | wc -l
+
    ${REF}/human_g1k_v37_chr20.fa
 
  −
cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d | wc -l
  −
  −
cat ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf | grep -v ^# | cut -f 1,2 | sort | uniq -d
  −
  −
  ${BIN}/samtools-hybrid tview ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.deduped.bam ${REF}/human_g1k_v37_chr20.fa
  −
  −
TYPE g , and  20:19989392
  −
TYPE g, and  20:20032998
  −
 
  −
${BIN}/samtools-hybrid tview ${IN}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.bam ${REF}/human_g1k_v37_chr20.fa
  −
 
  −
TYPE g , and  20:19989392
  −
TYPE g, and  20:20032998
  −
 
  −
14. SUMMARIZE STATISTICS
  −
 
  −
perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.exon-targetted.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
  −
  −
perl ${BIN}/vcfSummary.pl --vcf ${OUT}/NA12878.high_coverage.ILLUMINA.chr20.19986kb-20281kb.vcf --dbsnp ${REF}/dbsnp_129_b37.rod.chr20.map --bfile ${REF}/hapmap3_r3_b37_fwd.consensus.qc.poly.chr20
 

Navigation menu