Line 70: |
Line 70: |
| * FILTER_ARGS needs to be carefully calibrated in order to obtain a good set of filtered set of SNPs | | * FILTER_ARGS needs to be carefully calibrated in order to obtain a good set of filtered set of SNPs |
| | | |
− | ##################################################################
| |
− | # UMAKE CONFIGURATION FILE
| |
− | # This configuration file contains run-time configuration of
| |
− | # UMAKE SNP calling pipeline
| |
− | ###############################################################################
| |
− | ## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY
| |
− | ###############################################################################
| |
− | UMAKE_ROOT = FULL_PATH_TO_UMAKE ## e.g. /home/myid/code/umake
| |
− | INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples
| |
− | OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out
| |
− | BAM_INDEX = $(INPUT_ROOT)/umake-example.index # SAMPLE INDEX FILE (See documentation for detailed format)
| |
− | CHRS = 20 # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace
| |
− | OUT_DIR = $(OUTPUT_ROOT) # output directory
| |
− | OUT_PREFIX = umake-example # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated
| |
− | #PED_INDEX = $(INPUT_ROOT)/umake-example.ped # SAMPLE PED FILE (required only for chrX calling)
| |
− | #
| |
− | ###############################################################################
| |
− | ## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS
| |
− | ## --snpcall, --extract, --beagle, --thunder commands automatically set them
| |
− | ###############################################################################
| |
− | #RUN_INDEX = TRUE # create BAM index file
| |
− | #RUN_PILEUP = TRUE # create GLF file from BAM
| |
− | #RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls
| |
− | #RUN_VCFPILEUP = TRUE # create PVCF files using vcfPileup and run infoCollector
| |
− | #RUN_FILTER = TRUE # filter SNPs using vcfCooker
| |
− | #RUN_SPLIT = TRUE # split SNPs into chunks for genotype refinement
| |
− | #RUN_BEAGLE = TRUE # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS
| |
− | #RUN_SUBSET = TRUE # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER
| |
− | #RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS
| |
− | #
| |
− | ###############################################################################
| |
− | ## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)
| |
− | ###############################################################################
| |
− | #RUN_EXTRACT = TRUE # Instead of discovering SNPs, extract genotype liklihood in the site of VCF_EXTRACT
| |
− | #VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)
| |
− | #
| |
− | ###############################################################################
| |
− | ## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING
| |
− | ###############################################################################
| |
− | WRITE_TARGET_LOCI = TRUE # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup
| |
− | UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals has the same target. Otherwise, comment it out
| |
− | OFFSET_OFF_TARGET = 50 # Extend target by given # of bases
| |
− | MULTIPLE_TARGET_MAP = # Target per individual : Each line contains [SM_ID] [TARGET_BED]
| |
− | TARGET_DIR = target # Directory to store target information
| |
− | SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)
| |
− | ###############################################################################
| |
− | ## RESOURCE FILES : Download the full resources for full genome calling
| |
− | ###############################################################################
| |
− | REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.
| |
− | INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix
| |
− | DBSNP_PREFIX = $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix
| |
− | HM3_PREFIX = $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix
| |
− | ###############################################################################
| |
− | ## BINARIES
| |
− | ###############################################################################
| |
− | SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup
| |
− | SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd
| |
− | GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per indvidual
| |
− | GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options
| |
− | GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract # glfExtract for obtaining VCF for known sites
| |
− | VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup # vcfPileup to generate rich per-site information
| |
− | INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics
| |
− | VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes
| |
− | VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker # vcfCooker for filtering
| |
− | VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site
| |
− | VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement
| |
− | VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF
| |
− | BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
| |
− | VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input
| |
− | BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF
| |
− | THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step
| |
− | LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs
| |
− | BGZIP = $(UMAKE_ROOT)/ext/bgzip # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
| |
− | TABIX = $(UMAKE_ROOT)/ext/tabix # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
| |
− | ###############################################################################
| |
− | ## ARGUMENT FOR FILTERING
| |
− | ###############################################################################
| |
− | SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)
| |
− | FILTER_MAX_SAMPLE_DP = 20 # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically
| |
− | FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x defaul) -- will generate FILTER_MIN_TOTAL_DP automatically
| |
− | FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)
| |
− | #############################################################################
| |
− | ## RELATIVE DIRECTORY UNDER OUT_DIR
| |
− | #############################################################################
| |
− | BAM_GLF_DIR = glfs/bams # BAM level GLF
| |
− | SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)
| |
− | VCF_DIR = vcfs # unfiltered and filtered VCF
| |
− | PVCF_DIR = pvcfs # vcfPileup results
| |
− | SPLIT_DIR = split # chunks split to multiple overlappingpieces
| |
− | BEAGLE_DIR = beagle # beagle output
| |
− | THUNDER_DIR = thunder # MaCH/thunder output
| |
− | GLF_INDEX = glfIndex.ped # glfMultiples/glfExtract index file info
| |
− | #############################################################################
| |
− | ## OTHER OPTIONS
| |
− | #############################################################################
| |
− | UNIT_CHUNK = 5000000 # Chunk size of SNP calling : 5Mb is default
| |
− | LD_NSNPS = 10000 # Chunk size of genotype refinement : 10,000 SNPs
| |
− | LD_OVERLAP = 1000 # Overlapping # of SNPs between chinks : 1,000 SNPs
| |
− | RUN_INDEX_FORCE = FALSE # Regenerate BAM index file even if it exists
| |
− | MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering
| |
− | NOBAQ_SUBSTRINGS = SOLID # Avoid BAQ if the BAM file contains the substring
| |
− | ASSERT_BAM_EXIST = FALSE # Check if BAM file exists
| |
− | #############################################################################
| |
− | ## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM
| |
− | #############################################################################
| |
− | MOS_PREFIX = # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)
| |
− | MOS_NODES = # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS
| |
− | REMOTE_PREFIX = # REMOTE_PREFIX : Set if cluster node see the directory differently (e.g. /net/mymachine/[original-dir]
| |
− | host8-223:umake-examples hmkang$ cat umake-example.conf | perl -lne 'chomp; print " $_"'
| |
− | ##################################################################
| |
− | # UMAKE CONFIGURATION FILE
| |
− | # This configuration file contains run-time configuration of
| |
− | # UMAKE SNP calling pipeline
| |
− | ###############################################################################
| |
− | ## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY
| |
− | ###############################################################################
| |
− | #UMAKE_ROOT = FULL_PATH_TO_UMAKE ## e.g. /home/myid/code/umake
| |
− | #INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples
| |
− | #OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out
| |
− | BAM_INDEX = $(INPUT_ROOT)/umake-example.index # SAMPLE INDEX FILE (See documentation for detailed format)
| |
− | CHRS = 20 # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace
| |
− | OUT_DIR = $(OUTPUT_ROOT) # output directory
| |
− | OUT_PREFIX = umake-example # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated
| |
− | #PED_INDEX = $(INPUT_ROOT)/umake-example.ped # SAMPLE PED FILE (required only for chrX calling)
| |
− | #
| |
− | ###############################################################################
| |
− | ## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS
| |
− | ## --snpcall, --extract, --beagle, --thunder commands automatically set them
| |
− | ###############################################################################
| |
− | #RUN_INDEX = TRUE # create BAM index file
| |
− | #RUN_PILEUP = TRUE # create GLF file from BAM
| |
− | #RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls
| |
− | #RUN_VCFPILEUP = TRUE # create PVCF files using vcfPileup and run infoCollector
| |
− | #RUN_FILTER = TRUE # filter SNPs using vcfCooker
| |
− | #RUN_SPLIT = TRUE # split SNPs into chunks for genotype refinement
| |
− | #RUN_BEAGLE = TRUE # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS
| |
− | #RUN_SUBSET = TRUE # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER
| |
− | #RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS
| |
− | #
| |
− | ###############################################################################
| |
− | ## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)
| |
− | ###############################################################################
| |
− | #RUN_EXTRACT = TRUE # Instead of discovering SNPs, extract genotype liklihood in the site of VCF_EXTRACT
| |
− | #VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)
| |
− | #
| |
− | ###############################################################################
| |
− | ## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING
| |
− | ###############################################################################
| |
− | WRITE_TARGET_LOCI = TRUE # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup
| |
− | UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals has the same target. Otherwise, comment it out
| |
− | OFFSET_OFF_TARGET = 50 # Extend target by given # of bases
| |
− | MULTIPLE_TARGET_MAP = # Target per individual : Each line contains [SM_ID] [TARGET_BED]
| |
− | TARGET_DIR = target # Directory to store target information
| |
− | SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)
| |
− | ###############################################################################
| |
− | ## RESOURCE FILES : Download the full resources for full genome calling
| |
− | ###############################################################################
| |
− | REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.
| |
− | INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix
| |
− | DBSNP_PREFIX = $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix
| |
− | HM3_PREFIX = $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix
| |
− | ###############################################################################
| |
− | ## BINARIES
| |
− | ###############################################################################
| |
− | SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup
| |
− | SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd
| |
− | GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per indvidual
| |
− | GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options
| |
− | GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract # glfExtract for obtaining VCF for known sites
| |
− | VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup # vcfPileup to generate rich per-site information
| |
− | INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics
| |
− | VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes
| |
− | VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker # vcfCooker for filtering
| |
− | VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site
| |
− | VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement
| |
− | VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF
| |
− | BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
| |
− | VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input
| |
− | BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF
| |
− | THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step
| |
− | LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs
| |
− | BGZIP = $(UMAKE_ROOT)/ext/bgzip # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
| |
− | TABIX = $(UMAKE_ROOT)/ext/tabix # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE
| |
− | ###############################################################################
| |
− | ## ARGUMENT FOR FILTERING
| |
− | ###############################################################################
| |
− | SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)
| |
− | FILTER_MAX_SAMPLE_DP = 20 # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically
| |
− | FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x defaul) -- will generate FILTER_MIN_TOTAL_DP automatically
| |
− | FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)
| |
− | #############################################################################
| |
− | ## RELATIVE DIRECTORY UNDER OUT_DIR
| |
− | #############################################################################
| |
− | BAM_GLF_DIR = glfs/bams # BAM level GLF
| |
− | SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)
| |
− | VCF_DIR = vcfs # unfiltered and filtered VCF
| |
− | PVCF_DIR = pvcfs # vcfPileup results
| |
− | SPLIT_DIR = split # chunks split to multiple overlappingpieces
| |
− | BEAGLE_DIR = beagle # beagle output
| |
− | THUNDER_DIR = thunder # MaCH/thunder output
| |
− | GLF_INDEX = glfIndex.ped # glfMultiples/glfExtract index file info
| |
− | #############################################################################
| |
− | ## OTHER OPTIONS
| |
− | #############################################################################
| |
− | UNIT_CHUNK = 5000000 # Chunk size of SNP calling : 5Mb is default
| |
− | LD_NSNPS = 10000 # Chunk size of genotype refinement : 10,000 SNPs
| |
− | LD_OVERLAP = 1000 # Overlapping # of SNPs between chinks : 1,000 SNPs
| |
− | RUN_INDEX_FORCE = FALSE # Regenerate BAM index file even if it exists
| |
− | MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering
| |
− | NOBAQ_SUBSTRINGS = SOLID # Avoid BAQ if the BAM file contains the substring
| |
− | ASSERT_BAM_EXIST = FALSE # Check if BAM file exists
| |
− | #############################################################################
| |
− | ## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM
| |
− | #############################################################################
| |
− | MOS_PREFIX = # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)
| |
− | MOS_NODES = # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS
| |
− | REMOTE_PREFIX = # REMOTE_PREFIX : Set if cluster node see the directory differently (e.g. /net/mymachine/[original-dir])
| |
− | host8-223:umake-examples hmkang$ cat umake-example.conf | perl -lne 'chomp; print " $_"'
| |
| ################################################################## | | ################################################################## |
| # UMAKE CONFIGURATION FILE | | # UMAKE CONFIGURATION FILE |