Changes

From Genome Analysis Wiki
Jump to navigationJump to search
17,716 bytes removed ,  00:43, 7 July 2011
no edit summary
Line 70: Line 70:  
* FILTER_ARGS needs to be carefully calibrated in order to obtain a good set of filtered set of SNPs
 
* FILTER_ARGS needs to be carefully calibrated in order to obtain a good set of filtered set of SNPs
   −
##################################################################
  −
# UMAKE CONFIGURATION FILE
  −
# This configuration file contains run-time configuration of
  −
# UMAKE SNP calling pipeline
  −
###############################################################################
  −
## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY
  −
###############################################################################
  −
UMAKE_ROOT = FULL_PATH_TO_UMAKE  ## e.g. /home/myid/code/umake
  −
INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples
  −
OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out
  −
BAM_INDEX = $(INPUT_ROOT)/umake-example.index  # SAMPLE INDEX FILE (See documentation for detailed format)
  −
CHRS = 20              # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace
  −
OUT_DIR = $(OUTPUT_ROOT)    # output directory
  −
OUT_PREFIX = umake-example  # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated
  −
#PED_INDEX = $(INPUT_ROOT)/umake-example.ped    # SAMPLE PED FILE (required only for chrX calling)
  −
#
  −
###############################################################################
  −
## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS
  −
##  --snpcall, --extract, --beagle, --thunder commands automatically set them
  −
###############################################################################
  −
#RUN_INDEX = TRUE        # create BAM index file
  −
#RUN_PILEUP = TRUE      # create GLF file from BAM
  −
#RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls
  −
#RUN_VCFPILEUP = TRUE    # create PVCF files using vcfPileup and run infoCollector
  −
#RUN_FILTER = TRUE      # filter SNPs using vcfCooker
  −
#RUN_SPLIT = TRUE        # split SNPs into chunks for genotype refinement
  −
#RUN_BEAGLE = TRUE  # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS
  −
#RUN_SUBSET = TRUE  # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER
  −
#RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS
  −
#
  −
###############################################################################
  −
## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)
  −
###############################################################################
  −
#RUN_EXTRACT = TRUE  # Instead of discovering SNPs, extract genotype liklihood in the site of VCF_EXTRACT
  −
#VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)
  −
#
  −
###############################################################################
  −
## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING
  −
###############################################################################
  −
WRITE_TARGET_LOCI = TRUE  # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup
  −
UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals has the same target. Otherwise, comment it out
  −
OFFSET_OFF_TARGET = 50 # Extend target by given # of bases
  −
MULTIPLE_TARGET_MAP =  # Target per individual : Each line contains [SM_ID] [TARGET_BED]
  −
TARGET_DIR = target    # Directory to store target information
  −
SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)
  −
###############################################################################
  −
## RESOURCE FILES : Download the full resources for full genome calling
  −
###############################################################################
  −
REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.
  −
INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix
  −
DBSNP_PREFIX =  $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix
  −
HM3_PREFIX =  $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix
  −
###############################################################################
  −
## BINARIES
  −
###############################################################################
  −
SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup
  −
SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd
  −
GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per indvidual
  −
GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options
  −
GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract  # glfExtract for obtaining VCF for known sites
  −
VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup    # vcfPileup to generate rich per-site information
  −
INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics
  −
VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes
  −
VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker  # vcfCooker for filtering
  −
VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site
  −
VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement
  −
VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF
  −
BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
  −
VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input
  −
BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF
  −
THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step
  −
LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs
  −
BGZIP = $(UMAKE_ROOT)/ext/bgzip  # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
  −
TABIX = $(UMAKE_ROOT)/ext/tabix  # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
  −
###############################################################################
  −
## ARGUMENT FOR FILTERING
  −
###############################################################################
  −
SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)
  −
FILTER_MAX_SAMPLE_DP = 20  # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically
  −
FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x defaul) -- will generate FILTER_MIN_TOTAL_DP automatically
  −
FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)
  −
#############################################################################
  −
## RELATIVE DIRECTORY UNDER OUT_DIR
  −
#############################################################################
  −
BAM_GLF_DIR = glfs/bams  # BAM level GLF
  −
SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)
  −
VCF_DIR = vcfs            # unfiltered and filtered VCF
  −
PVCF_DIR = pvcfs          # vcfPileup results
  −
SPLIT_DIR = split        # chunks split to multiple overlappingpieces
  −
BEAGLE_DIR = beagle      # beagle output
  −
THUNDER_DIR = thunder    # MaCH/thunder output
  −
GLF_INDEX = glfIndex.ped  # glfMultiples/glfExtract index file info
  −
#############################################################################
  −
## OTHER OPTIONS
  −
#############################################################################
  −
UNIT_CHUNK = 5000000      # Chunk size of SNP calling : 5Mb is default
  −
LD_NSNPS = 10000          # Chunk size of genotype refinement : 10,000 SNPs
  −
LD_OVERLAP = 1000        # Overlapping # of SNPs between chinks : 1,000 SNPs
  −
RUN_INDEX_FORCE = FALSE  # Regenerate BAM index file even if it exists
  −
MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering
  −
NOBAQ_SUBSTRINGS = SOLID  # Avoid BAQ if the BAM file contains the substring
  −
ASSERT_BAM_EXIST = FALSE  # Check if BAM file exists
  −
#############################################################################
  −
## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM
  −
#############################################################################
  −
MOS_PREFIX =    # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)
  −
MOS_NODES =      # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS
  −
REMOTE_PREFIX =  # REMOTE_PREFIX : Set if cluster node see the directory differently (e.g. /net/mymachine/[original-dir]
  −
host8-223:umake-examples hmkang$ cat umake-example.conf | perl -lne 'chomp; print " $_"'
  −
##################################################################
  −
# UMAKE CONFIGURATION FILE
  −
# This configuration file contains run-time configuration of
  −
# UMAKE SNP calling pipeline
  −
###############################################################################
  −
## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY
  −
###############################################################################
  −
#UMAKE_ROOT = FULL_PATH_TO_UMAKE  ## e.g. /home/myid/code/umake
  −
#INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples
  −
#OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out
  −
BAM_INDEX = $(INPUT_ROOT)/umake-example.index  # SAMPLE INDEX FILE (See documentation for detailed format)
  −
CHRS = 20              # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace
  −
OUT_DIR = $(OUTPUT_ROOT)    # output directory
  −
OUT_PREFIX = umake-example  # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated
  −
#PED_INDEX = $(INPUT_ROOT)/umake-example.ped    # SAMPLE PED FILE (required only for chrX calling)
  −
#
  −
###############################################################################
  −
## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS
  −
##  --snpcall, --extract, --beagle, --thunder commands automatically set them
  −
###############################################################################
  −
#RUN_INDEX = TRUE        # create BAM index file
  −
#RUN_PILEUP = TRUE      # create GLF file from BAM
  −
#RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls
  −
#RUN_VCFPILEUP = TRUE    # create PVCF files using vcfPileup and run infoCollector
  −
#RUN_FILTER = TRUE      # filter SNPs using vcfCooker
  −
#RUN_SPLIT = TRUE        # split SNPs into chunks for genotype refinement
  −
#RUN_BEAGLE = TRUE  # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS
  −
#RUN_SUBSET = TRUE  # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER
  −
#RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS
  −
#
  −
###############################################################################
  −
## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)
  −
###############################################################################
  −
#RUN_EXTRACT = TRUE  # Instead of discovering SNPs, extract genotype liklihood in the site of VCF_EXTRACT
  −
#VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)
  −
#
  −
###############################################################################
  −
## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING
  −
###############################################################################
  −
WRITE_TARGET_LOCI = TRUE  # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup
  −
UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals has the same target. Otherwise, comment it out
  −
OFFSET_OFF_TARGET = 50 # Extend target by given # of bases
  −
MULTIPLE_TARGET_MAP =  # Target per individual : Each line contains [SM_ID] [TARGET_BED]
  −
TARGET_DIR = target    # Directory to store target information
  −
SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)
  −
###############################################################################
  −
## RESOURCE FILES : Download the full resources for full genome calling
  −
###############################################################################
  −
REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.
  −
INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix
  −
DBSNP_PREFIX =  $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix
  −
HM3_PREFIX =  $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix
  −
###############################################################################
  −
## BINARIES
  −
###############################################################################
  −
SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup
  −
SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd
  −
GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per indvidual
  −
GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options
  −
GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract  # glfExtract for obtaining VCF for known sites
  −
VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup    # vcfPileup to generate rich per-site information
  −
INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics
  −
VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes
  −
VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker  # vcfCooker for filtering
  −
VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site
  −
VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement
  −
VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF
  −
BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
  −
VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input
  −
BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF
  −
THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step
  −
LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs
  −
BGZIP = $(UMAKE_ROOT)/ext/bgzip  # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
  −
TABIX = $(UMAKE_ROOT)/ext/tabix  # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
  −
###############################################################################
  −
## ARGUMENT FOR FILTERING
  −
###############################################################################
  −
SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)
  −
FILTER_MAX_SAMPLE_DP = 20  # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically
  −
FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x defaul) -- will generate FILTER_MIN_TOTAL_DP automatically
  −
FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)
  −
#############################################################################
  −
## RELATIVE DIRECTORY UNDER OUT_DIR
  −
#############################################################################
  −
BAM_GLF_DIR = glfs/bams  # BAM level GLF
  −
SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)
  −
VCF_DIR = vcfs            # unfiltered and filtered VCF
  −
PVCF_DIR = pvcfs          # vcfPileup results
  −
SPLIT_DIR = split        # chunks split to multiple overlappingpieces
  −
BEAGLE_DIR = beagle      # beagle output
  −
THUNDER_DIR = thunder    # MaCH/thunder output
  −
GLF_INDEX = glfIndex.ped  # glfMultiples/glfExtract index file info
  −
#############################################################################
  −
## OTHER OPTIONS
  −
#############################################################################
  −
UNIT_CHUNK = 5000000      # Chunk size of SNP calling : 5Mb is default
  −
LD_NSNPS = 10000          # Chunk size of genotype refinement : 10,000 SNPs
  −
LD_OVERLAP = 1000        # Overlapping # of SNPs between chinks : 1,000 SNPs
  −
RUN_INDEX_FORCE = FALSE  # Regenerate BAM index file even if it exists
  −
MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering
  −
NOBAQ_SUBSTRINGS = SOLID  # Avoid BAQ if the BAM file contains the substring
  −
ASSERT_BAM_EXIST = FALSE  # Check if BAM file exists
  −
#############################################################################
  −
## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM
  −
#############################################################################
  −
MOS_PREFIX =    # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)
  −
MOS_NODES =      # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS
  −
REMOTE_PREFIX =  # REMOTE_PREFIX : Set if cluster node see the directory differently (e.g. /net/mymachine/[original-dir])
  −
host8-223:umake-examples hmkang$ cat umake-example.conf | perl -lne 'chomp; print " $_"'
   
  ##################################################################
 
  ##################################################################
 
  # UMAKE CONFIGURATION FILE
 
  # UMAKE CONFIGURATION FILE

Navigation menu