Changes

UMAKE (view source)

Revision as of 00:43, 7 July 2011

17,716 bytes removed , 00:43, 7 July 2011

no edit summary

Line 70: Line 70:

* FILTER_ARGS needs to be carefully calibrated in order to obtain a good set of filtered SNPs

−

~~##################################################################~~

−

~~# UMAKE CONFIGURATION FILE~~

−

~~# This configuration file contains run-time configuration of~~

−

~~# UMAKE SNP calling pipeline~~

−

~~###############################################################################~~

−

~~## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY~~

−

~~###############################################################################~~

−

~~UMAKE_ROOT = FULL_PATH_TO_UMAKE ## e.g. /home/myid/code/umake~~

−

~~INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples~~

−

~~OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out~~

−

~~BAM_INDEX = $(INPUT_ROOT)/umake-example.index # SAMPLE INDEX FILE (See documentation for detailed format)~~

−

~~CHRS = 20 # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace~~

−

~~OUT_DIR = $(OUTPUT_ROOT) # output directory~~

−

~~OUT_PREFIX = umake-example # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated~~

−

~~#PED_INDEX = $(INPUT_ROOT)/umake-example.ped # SAMPLE PED FILE (required only for chrX calling)~~

−

#

−

~~###############################################################################~~

−

~~## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS~~

−

~~## --snpcall, --extract, --beagle, --thunder commands automatically set them~~

−

~~###############################################################################~~

−

~~#RUN_INDEX = TRUE # create BAM index file~~

−

~~#RUN_PILEUP = TRUE # create GLF file from BAM~~

−

~~#RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls~~

−

~~#RUN_VCFPILEUP = TRUE # create PVCF files using vcfPileup and run infoCollector~~

−

~~#RUN_FILTER = TRUE # filter SNPs using vcfCooker~~

−

~~#RUN_SPLIT = TRUE # split SNPs into chunks for genotype refinement~~

−

~~#RUN_BEAGLE = TRUE # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS~~

−

~~#RUN_SUBSET = TRUE # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER~~

−

~~#RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS~~

−

#

−

~~###############################################################################~~

−

~~## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)~~

−

~~###############################################################################~~

−

~~#RUN_EXTRACT = TRUE # Instead of discovering SNPs, extract genotype likelihoods at the sites in VCF_EXTRACT~~

−

~~#VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)~~

−

#

−

~~###############################################################################~~

−

~~## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING~~

−

~~###############################################################################~~

−

~~WRITE_TARGET_LOCI = TRUE # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup~~

−

~~UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals have the same target. Otherwise, comment it out~~

−

~~OFFSET_OFF_TARGET = 50 # Extend target by given # of bases~~

−

~~MULTIPLE_TARGET_MAP = # Target per individual : Each line contains [SM_ID] [TARGET_BED]~~

−

~~TARGET_DIR = target # Directory to store target information~~

−

~~SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)~~

−

~~###############################################################################~~

−

~~## RESOURCE FILES : Download the full resources for full genome calling~~

−

~~###############################################################################~~

−

~~REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.~~

−

~~INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix~~

−

~~DBSNP_PREFIX = $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix~~

−

~~HM3_PREFIX = $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix~~

−

~~###############################################################################~~

−

~~## BINARIES~~

−

~~###############################################################################~~

−

~~SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup~~

−

~~SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd~~

−

~~GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per individual~~

−

~~GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options~~

−

~~GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract # glfExtract for obtaining VCF for known sites~~

−

~~VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup # vcfPileup to generate rich per-site information~~

−

~~INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics~~

−

~~VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes~~

−

~~VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker # vcfCooker for filtering~~

−

~~VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site~~

−

~~VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement~~

−

~~VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF~~

−

BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

−

~~VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input~~

−

~~BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF~~

−

~~THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step~~

−

~~LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs~~

−

~~BGZIP = $(UMAKE_ROOT)/ext/bgzip # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE~~

−

~~TABIX = $(UMAKE_ROOT)/ext/tabix # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE~~

−

~~###############################################################################~~

−

~~## ARGUMENT FOR FILTERING~~

−

~~###############################################################################~~

−

~~SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)~~

−

~~FILTER_MAX_SAMPLE_DP = 20 # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically~~

−

~~FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x default) -- will generate FILTER_MIN_TOTAL_DP automatically~~

−

FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)

−

~~#############################################################################~~

−

~~## RELATIVE DIRECTORY UNDER OUT_DIR~~

−

~~#############################################################################~~

−

~~BAM_GLF_DIR = glfs/bams # BAM level GLF~~

−

~~SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)~~

−

~~VCF_DIR = vcfs # unfiltered and filtered VCF~~

−

~~PVCF_DIR = pvcfs # vcfPileup results~~

−

~~SPLIT_DIR = split # chunks split to multiple overlapping pieces~~

−

~~BEAGLE_DIR = beagle # beagle output~~

−

~~THUNDER_DIR = thunder # MaCH/thunder output~~

−

~~GLF_INDEX = glfIndex.ped # glfMultiples/glfExtract index file info~~

−

~~#############################################################################~~

−

~~## OTHER OPTIONS~~

−

~~#############################################################################~~

−

~~UNIT_CHUNK = 5000000 # Chunk size of SNP calling : 5Mb is default~~

−

~~LD_NSNPS = 10000 # Chunk size of genotype refinement : 10,000 SNPs~~

−

~~LD_OVERLAP = 1000 # Overlapping # of SNPs between chunks : 1,000 SNPs~~

−

~~RUN_INDEX_FORCE = FALSE # Regenerate BAM index file even if it exists~~

−

~~MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering~~

−

~~NOBAQ_SUBSTRINGS = SOLID # Avoid BAQ if the BAM file contains the substring~~

−

~~ASSERT_BAM_EXIST = FALSE # Check if BAM file exists~~

−

~~#############################################################################~~

−

~~## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM~~

−

~~#############################################################################~~

−

~~MOS_PREFIX = # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)~~

−

~~MOS_NODES = # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS~~

−

~~REMOTE_PREFIX = # REMOTE_PREFIX : Set if cluster nodes see the directory differently (e.g. /net/mymachine/[original-dir])~~

−

~~host8-223:umake-examples hmkang$ cat umake-example.conf | perl -lne 'chomp; print " $_"'~~

−

~~##################################################################~~

−

~~# UMAKE CONFIGURATION FILE~~

−

~~# This configuration file contains run-time configuration of~~

−

~~# UMAKE SNP calling pipeline~~

−

~~###############################################################################~~

−

~~## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY~~

−

~~###############################################################################~~

−

~~#UMAKE_ROOT = FULL_PATH_TO_UMAKE ## e.g. /home/myid/code/umake~~

−

~~#INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples~~

−

~~#OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out~~

−

~~BAM_INDEX = $(INPUT_ROOT)/umake-example.index # SAMPLE INDEX FILE (See documentation for detailed format)~~

−

~~CHRS = 20 # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace~~

−

~~OUT_DIR = $(OUTPUT_ROOT) # output directory~~

−

~~OUT_PREFIX = umake-example # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated~~

−

~~#PED_INDEX = $(INPUT_ROOT)/umake-example.ped # SAMPLE PED FILE (required only for chrX calling)~~

−

#

−

~~###############################################################################~~

−

~~## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS~~

−

~~## --snpcall, --extract, --beagle, --thunder commands automatically set them~~

−

~~###############################################################################~~

−

~~#RUN_INDEX = TRUE # create BAM index file~~

−

~~#RUN_PILEUP = TRUE # create GLF file from BAM~~

−

~~#RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls~~

−

~~#RUN_VCFPILEUP = TRUE # create PVCF files using vcfPileup and run infoCollector~~

−

~~#RUN_FILTER = TRUE # filter SNPs using vcfCooker~~

−

~~#RUN_SPLIT = TRUE # split SNPs into chunks for genotype refinement~~

−

~~#RUN_BEAGLE = TRUE # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS~~

−

~~#RUN_SUBSET = TRUE # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER~~

−

~~#RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS~~

−

#

−

~~###############################################################################~~

−

~~## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)~~

−

~~###############################################################################~~

−

~~#RUN_EXTRACT = TRUE # Instead of discovering SNPs, extract genotype likelihoods at the sites in VCF_EXTRACT~~

−

~~#VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)~~

−

#

−

~~###############################################################################~~

−

~~## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING~~

−

~~###############################################################################~~

−

~~WRITE_TARGET_LOCI = TRUE # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup~~

−

~~UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals have the same target. Otherwise, comment it out~~

−

~~OFFSET_OFF_TARGET = 50 # Extend target by given # of bases~~

−

~~MULTIPLE_TARGET_MAP = # Target per individual : Each line contains [SM_ID] [TARGET_BED]~~

−

~~TARGET_DIR = target # Directory to store target information~~

−

~~SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)~~

−

~~###############################################################################~~

−

~~## RESOURCE FILES : Download the full resources for full genome calling~~

−

~~###############################################################################~~

−

~~REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.~~

−

~~INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix~~

−

~~DBSNP_PREFIX = $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix~~

−

~~HM3_PREFIX = $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix~~

−

~~###############################################################################~~

−

~~## BINARIES~~

−

~~###############################################################################~~

−

~~SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup~~

−

~~SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd~~

−

~~GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per individual~~

−

~~GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options~~

−

~~GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract # glfExtract for obtaining VCF for known sites~~

−

~~VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup # vcfPileup to generate rich per-site information~~

−

~~INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics~~

−

~~VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes~~

−

~~VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker # vcfCooker for filtering~~

−

~~VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site~~

−

~~VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement~~

−

~~VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF~~

−

BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

−

~~VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input~~

−

~~BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF~~

−

~~THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step~~

−

~~LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs~~

−

~~BGZIP = $(UMAKE_ROOT)/ext/bgzip # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE~~

−

~~TABIX = $(UMAKE_ROOT)/ext/tabix # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE~~

−

~~###############################################################################~~

−

~~## ARGUMENT FOR FILTERING~~

−

~~###############################################################################~~

−

~~SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)~~

−

~~FILTER_MAX_SAMPLE_DP = 20 # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically~~

−

~~FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x default) -- will generate FILTER_MIN_TOTAL_DP automatically~~

−

FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)

−

~~#############################################################################~~

−

~~## RELATIVE DIRECTORY UNDER OUT_DIR~~

−

~~#############################################################################~~

−

~~BAM_GLF_DIR = glfs/bams # BAM level GLF~~

−

~~SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)~~

−

~~VCF_DIR = vcfs # unfiltered and filtered VCF~~

−

~~PVCF_DIR = pvcfs # vcfPileup results~~

−

~~SPLIT_DIR = split # chunks split to multiple overlapping pieces~~

−

~~BEAGLE_DIR = beagle # beagle output~~

−

~~THUNDER_DIR = thunder # MaCH/thunder output~~

−

~~GLF_INDEX = glfIndex.ped # glfMultiples/glfExtract index file info~~

−

~~#############################################################################~~

−

~~## OTHER OPTIONS~~

−

~~#############################################################################~~

−

~~UNIT_CHUNK = 5000000 # Chunk size of SNP calling : 5Mb is default~~

−

~~LD_NSNPS = 10000 # Chunk size of genotype refinement : 10,000 SNPs~~

−

~~LD_OVERLAP = 1000 # Overlapping # of SNPs between chunks : 1,000 SNPs~~

−

~~RUN_INDEX_FORCE = FALSE # Regenerate BAM index file even if it exists~~

−

~~MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering~~

−

~~NOBAQ_SUBSTRINGS = SOLID # Avoid BAQ if the BAM file contains the substring~~

−

~~ASSERT_BAM_EXIST = FALSE # Check if BAM file exists~~

−

~~#############################################################################~~

−

~~## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM~~

−

~~#############################################################################~~

−

~~MOS_PREFIX = # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)~~

−

~~MOS_NODES = # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS~~

−

~~REMOTE_PREFIX = # REMOTE_PREFIX : Set if cluster nodes see the directory differently (e.g. /net/mymachine/[original-dir])~~

−

~~host8-223:umake-examples hmkang$ cat umake-example.conf | perl -lne 'chomp; print " $_"'~~

##################################################################

# UMAKE CONFIGURATION FILE

Hmkang

Administrators

1,120

edits

Changes

UMAKE (view source)

Revision as of 00:43, 7 July 2011

Navigation menu

Page actions

Page actions

Personal tools

quick links

teaching

Navigation

Search

Tools