Changes

UMAKE (view source)

Revision as of 00:35, 7 July 2011

27,240 bytes added , 00:35, 7 July 2011

no edit summary

Line 10: Line 10:

To build UMAKE, download the UMAKE package from the link above and run the following series of commands.

−

tar xzvf umake.v1.~~20110705~~.tar.gz

+

tar xzvf umake.v1.0.1.20110706.tar.gz

cd umake

make

Line 58: Line 58:

* BAM files need to be duplicate-marked and base-quality recalibrated in order to obtain high quality SNP calls.

* Each line of Index file represents each individual under the following format. Note that multiple BAMs per individual may be provided.

−

[SAMPLE_ID] [COMMA SEPARATED POPULATION LABELS] [BAM_FILE1] [BAM_FILE2] ...

+

[SAMPLE_ID] [COMMA SEPARATED POPULATION LABELS] [BAM_FILE1] [BAM_FILE2] ...

* Additional input Files including Pedigree files (PED format) (to specify gender information in chrX calling), Target information (UCSC's BED format) in targeted or whole exome capture sequencing may be provided.

* Configuration file contains core information of run-time options including the software binaries and command line arguments. Refer to the example configuration file for further information.

+

== Configuration File ==

+

The example configuration file below illustrates how to configure UMAKE. Here are a few highlights:

+

* Steps to run can be set automatically using --snpcall, --beagle, --thunder, or --extract, or set manually by uncommenting options in STEPS_TO_RUN. Note that the steps to run should be in consecutive order.

+

* To run on full genome data, the resource files should be downloaded from [[ftp://share.sph.umich.edu/1000genomes/umake-resources/ | FTP Download of Full Resource Files]].

+

* You need to comment out the target-related configuration lines (the EXOME/TARGETED SEQUENCING options) in order to run on whole genome data

+

* FILTER_ARGS needs to be carefully calibrated in order to obtain a good set of filtered SNPs

+

##################################################################

+

# UMAKE CONFIGURATION FILE

+

# This configuration file contains run-time configuration of

+

# UMAKE SNP calling pipeline

+

###############################################################################

+

## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY

+

###############################################################################

+

UMAKE_ROOT = FULL_PATH_TO_UMAKE ## e.g. /home/myid/code/umake

+

INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples

+

OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out

+

BAM_INDEX = $(INPUT_ROOT)/umake-example.index # SAMPLE INDEX FILE (See documentation for detailed format)

+

CHRS = 20 # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace

+

OUT_DIR = $(OUTPUT_ROOT) # output directory

+

OUT_PREFIX = umake-example # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated

+

#PED_INDEX = $(INPUT_ROOT)/umake-example.ped # SAMPLE PED FILE (required only for chrX calling)

+

#

+

###############################################################################

+

## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS

+

## --snpcall, --extract, --beagle, --thunder commands automatically set them

+

###############################################################################

+

#RUN_INDEX = TRUE # create BAM index file

+

#RUN_PILEUP = TRUE # create GLF file from BAM

+

#RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls

+

#RUN_VCFPILEUP = TRUE # create PVCF files using vcfPileup and run infoCollector

+

#RUN_FILTER = TRUE # filter SNPs using vcfCooker

+

#RUN_SPLIT = TRUE # split SNPs into chunks for genotype refinement

+

#RUN_BEAGLE = TRUE # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS

+

#RUN_SUBSET = TRUE # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER

+

#RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS

+

#

+

###############################################################################

+

## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)

+

###############################################################################

+

#RUN_EXTRACT = TRUE # Instead of discovering SNPs, extract genotype likelihood in the site of VCF_EXTRACT

+

#VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)

+

#

+

###############################################################################

+

## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING

+

###############################################################################

+

WRITE_TARGET_LOCI = TRUE # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup

+

UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals have the same target. Otherwise, comment it out

+

OFFSET_OFF_TARGET = 50 # Extend target by given # of bases

+

MULTIPLE_TARGET_MAP = # Target per individual : Each line contains [SM_ID] [TARGET_BED]

+

TARGET_DIR = target # Directory to store target information

+

SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)

+

###############################################################################

+

## RESOURCE FILES : Download the full resources for full genome calling

+

###############################################################################

+

REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.

+

INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix

+

DBSNP_PREFIX = $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix

+

HM3_PREFIX = $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix

+

###############################################################################

+

## BINARIES

+

###############################################################################

+

SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup

+

SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd

+

GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per individual

+

GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options

+

GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract # glfExtract for obtaining VCF for known sites

+

VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup # vcfPileup to generate rich per-site information

+

INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics

+

VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes

+

VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker # vcfCooker for filtering

+

VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site

+

VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement

+

VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF

+

BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input

+

BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF

+

THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step

+

LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs

+

BGZIP = $(UMAKE_ROOT)/ext/bgzip # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

TABIX = $(UMAKE_ROOT)/ext/tabix # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

###############################################################################

+

## ARGUMENT FOR FILTERING

+

###############################################################################

+

SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)

+

FILTER_MAX_SAMPLE_DP = 20 # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically

+

FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x default) -- will generate FILTER_MIN_TOTAL_DP automatically

+

FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)

+

#############################################################################

+

## RELATIVE DIRECTORY UNDER OUT_DIR

+

#############################################################################

+

BAM_GLF_DIR = glfs/bams # BAM level GLF

+

SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)

+

VCF_DIR = vcfs # unfiltered and filtered VCF

+

PVCF_DIR = pvcfs # vcfPileup results

+

SPLIT_DIR = split # chunks split to multiple overlapping pieces

+

BEAGLE_DIR = beagle # beagle output

+

THUNDER_DIR = thunder # MaCH/thunder output

+

GLF_INDEX = glfIndex.ped # glfMultiples/glfExtract index file info

+

#############################################################################

+

## OTHER OPTIONS

+

#############################################################################

+

UNIT_CHUNK = 5000000 # Chunk size of SNP calling : 5Mb is default

+

LD_NSNPS = 10000 # Chunk size of genotype refinement : 10,000 SNPs

+

LD_OVERLAP = 1000 # Overlapping # of SNPs between chunks : 1,000 SNPs

+

RUN_INDEX_FORCE = FALSE # Regenerate BAM index file even if it exists

+

MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering

+

NOBAQ_SUBSTRINGS = SOLID # Avoid BAQ if the BAM file contains the substring

+

ASSERT_BAM_EXIST = FALSE # Check if BAM file exists

+

#############################################################################

+

## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM

+

#############################################################################

+

MOS_PREFIX = # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)

+

MOS_NODES = # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS

+

REMOTE_PREFIX = # REMOTE_PREFIX : Set if cluster node see the directory differently (e.g. /net/mymachine/[original-dir])

+

host8-223:umake-examples hmkang$ cat umake-example.conf | perl -lne 'chomp; print " $_"'

+

##################################################################

+

# UMAKE CONFIGURATION FILE

+

# This configuration file contains run-time configuration of

+

# UMAKE SNP calling pipeline

+

###############################################################################

+

## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY

+

###############################################################################

+

#UMAKE_ROOT = FULL_PATH_TO_UMAKE ## e.g. /home/myid/code/umake

+

#INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples

+

#OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out

+

BAM_INDEX = $(INPUT_ROOT)/umake-example.index # SAMPLE INDEX FILE (See documentation for detailed format)

+

CHRS = 20 # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace

+

OUT_DIR = $(OUTPUT_ROOT) # output directory

+

OUT_PREFIX = umake-example # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated

+

#PED_INDEX = $(INPUT_ROOT)/umake-example.ped # SAMPLE PED FILE (required only for chrX calling)

+

#

+

###############################################################################

+

## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS

+

## --snpcall, --extract, --beagle, --thunder commands automatically set them

+

###############################################################################

+

#RUN_INDEX = TRUE # create BAM index file

+

#RUN_PILEUP = TRUE # create GLF file from BAM

+

#RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls

+

#RUN_VCFPILEUP = TRUE # create PVCF files using vcfPileup and run infoCollector

+

#RUN_FILTER = TRUE # filter SNPs using vcfCooker

+

#RUN_SPLIT = TRUE # split SNPs into chunks for genotype refinement

+

#RUN_BEAGLE = TRUE # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS

+

#RUN_SUBSET = TRUE # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER

+

#RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS

+

#

+

###############################################################################

+

## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)

+

###############################################################################

+

#RUN_EXTRACT = TRUE # Instead of discovering SNPs, extract genotype likelihood in the site of VCF_EXTRACT

+

#VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)

+

#

+

###############################################################################

+

## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING

+

###############################################################################

+

WRITE_TARGET_LOCI = TRUE # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup

+

UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals have the same target. Otherwise, comment it out

+

OFFSET_OFF_TARGET = 50 # Extend target by given # of bases

+

MULTIPLE_TARGET_MAP = # Target per individual : Each line contains [SM_ID] [TARGET_BED]

+

TARGET_DIR = target # Directory to store target information

+

SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)

+

###############################################################################

+

## RESOURCE FILES : Download the full resources for full genome calling

+

###############################################################################

+

REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.

+

INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix

+

DBSNP_PREFIX = $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix

+

HM3_PREFIX = $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix

+

###############################################################################

+

## BINARIES

+

###############################################################################

+

SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup

+

SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd

+

GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per individual

+

GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options

+

GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract # glfExtract for obtaining VCF for known sites

+

VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup # vcfPileup to generate rich per-site information

+

INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics

+

VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes

+

VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker # vcfCooker for filtering

+

VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site

+

VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement

+

VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF

+

BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input

+

BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF

+

THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step

+

LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs

+

BGZIP = $(UMAKE_ROOT)/ext/bgzip # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

TABIX = $(UMAKE_ROOT)/ext/tabix # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

###############################################################################

+

## ARGUMENT FOR FILTERING

+

###############################################################################

+

SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)

+

FILTER_MAX_SAMPLE_DP = 20 # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically

+

FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x default) -- will generate FILTER_MIN_TOTAL_DP automatically

+

FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)

+

#############################################################################

+

## RELATIVE DIRECTORY UNDER OUT_DIR

+

#############################################################################

+

BAM_GLF_DIR = glfs/bams # BAM level GLF

+

SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)

+

VCF_DIR = vcfs # unfiltered and filtered VCF

+

PVCF_DIR = pvcfs # vcfPileup results

+

SPLIT_DIR = split # chunks split to multiple overlapping pieces

+

BEAGLE_DIR = beagle # beagle output

+

THUNDER_DIR = thunder # MaCH/thunder output

+

GLF_INDEX = glfIndex.ped # glfMultiples/glfExtract index file info

+

#############################################################################

+

## OTHER OPTIONS

+

#############################################################################

+

UNIT_CHUNK = 5000000 # Chunk size of SNP calling : 5Mb is default

+

LD_NSNPS = 10000 # Chunk size of genotype refinement : 10,000 SNPs

+

LD_OVERLAP = 1000 # Overlapping # of SNPs between chunks : 1,000 SNPs

+

RUN_INDEX_FORCE = FALSE # Regenerate BAM index file even if it exists

+

MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering

+

NOBAQ_SUBSTRINGS = SOLID # Avoid BAQ if the BAM file contains the substring

+

ASSERT_BAM_EXIST = FALSE # Check if BAM file exists

+

#############################################################################

+

## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM

+

#############################################################################

+

MOS_PREFIX = # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)

+

MOS_NODES = # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS

+

REMOTE_PREFIX = # REMOTE_PREFIX : Set if cluster node see the directory differently (e.g. /net/mymachine/[original-dir])

+

host8-223:umake-examples hmkang$ cat umake-example.conf | perl -lne 'chomp; print " $_"'

+

##################################################################

+

# UMAKE CONFIGURATION FILE

+

# This configuration file contains run-time configuration of

+

# UMAKE SNP calling pipeline

+

###############################################################################

+

## KEY ELEMENTS TO CONFIGURE : NEED TO MODIFY

+

###############################################################################

+

#UMAKE_ROOT = FULL_PATH_TO_UMAKE ## e.g. /home/myid/code/umake

+

#INPUT_ROOT = FULL_PATH_TO_CURRENT_DIR ## e.g. /home/myid/data/umake-examples

+

#OUTPUT_ROOT = FULL_PATH_TO_OUTPUT_DIR ## e.g. /home/myid/data/umake-examples/out

+

BAM_INDEX = $(INPUT_ROOT)/umake-example.index # SAMPLE INDEX FILE (See documentation for detailed format)

+

CHRS = 20 # List of chromosomes to call SNPs. For multiple chromosomes, separate by whitespace

+

OUT_DIR = $(OUTPUT_ROOT) # output directory

+

OUT_PREFIX = umake-example # prefix of output Makefile $(OUT_PREFIX).Makefile will be generated

+

#PED_INDEX = $(INPUT_ROOT)/umake-example.ped # SAMPLE PED FILE (required only for chrX calling)

+

#

+

###############################################################################

+

## STEPS TO RUN : COMMENT OUT TO EXCLUDE CERTAIN STEPS

+

## --snpcall, --extract, --beagle, --thunder commands automatically set them

+

###############################################################################

+

#RUN_INDEX = TRUE # create BAM index file

+

#RUN_PILEUP = TRUE # create GLF file from BAM

+

#RUN_GLFMULTIPLES = TRUE # create unfiltered SNP calls

+

#RUN_VCFPILEUP = TRUE # create PVCF files using vcfPileup and run infoCollector

+

#RUN_FILTER = TRUE # filter SNPs using vcfCooker

+

#RUN_SPLIT = TRUE # split SNPs into chunks for genotype refinement

+

#RUN_BEAGLE = TRUE # BEAGLE - MUST SET AFTER FINISHING PREVIOUS STEPS

+

#RUN_SUBSET = TRUE # SUBSET FOR THUNDER - MAY BE SET WITH BEAGLE STEP TOGETHER

+

#RUN_THUNDER = TRUE # THUNDER - MUST SET AFTER FINISHING PREVIOUS STEPS

+

#

+

###############################################################################

+

## OPTIONS FOR GLFEXTRACT (GLFMULTIPLES, VCFPILEUP, FILTER MUST BE TURNED OFF)

+

###############################################################################

+

#RUN_EXTRACT = TRUE # Instead of discovering SNPs, extract genotype likelihood in the site of VCF_EXTRACT

+

#VCF_EXTRACT = # whole-genome (gzipped and tabixed) .vcf.gz file to extract the site information to genotype (such as 1000 Genomes site list)

+

#

+

###############################################################################

+

## OPTIONS FOR EXOME/TARGETED SEQUENCING : COMMENT OUT IF WHOLE GENOME SEQUENCING

+

###############################################################################

+

WRITE_TARGET_LOCI = TRUE # FOR TARGETED SEQUENCING ONLY -- Write loci file when performing pileup

+

UNIFORM_TARGET_BED = $(INPUT_ROOT)/umake-example.bed # Targeted sequencing : When all individuals have the same target. Otherwise, comment it out

+

OFFSET_OFF_TARGET = 50 # Extend target by given # of bases

+

MULTIPLE_TARGET_MAP = # Target per individual : Each line contains [SM_ID] [TARGET_BED]

+

TARGET_DIR = target # Directory to store target information

+

SAMTOOLS_VIEW_TARGET_ONLY = TRUE # When performing samtools view, exclude off-target regions (may make command line too long)

+

#

+

###############################################################################

+

## RESOURCE FILES : Download the full resources for full genome calling

+

###############################################################################

+

REF = $(INPUT_ROOT)/data/ref/human_g1k_v37_chr20.fa # Reference FASTA sequence. Note that the FASTA file in the example package is only chr20.

+

INDEL_PREFIX = $(INPUT_ROOT)/data/indels/1kg.pilot_release.merged.indels.sites.hg19 # 1000 Genomes Pilot 1 indel VCF prefix

+

DBSNP_PREFIX = $(INPUT_ROOT)/data/dbsnp/dbsnp_129_b37.rod # dbSNP file prefix

+

HM3_PREFIX = $(INPUT_ROOT)/data/HapMap/hapmap3_r3_b37_fwd.consensus.qc.poly # HapMap3 polymorphic site prefix

+

#

+

###############################################################################

+

## BINARIES

+

###############################################################################

+

SAMTOOLS_FOR_PILEUP = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools pileup

+

SAMTOOLS_FOR_OTHERS = $(UMAKE_ROOT)/bin/samtools-hybrid # for samtools view and calmd

+

GLFMERGE = $(UMAKE_ROOT)/bin/glfMerge # glfMerge when multiple BAMs exist per individual

+

GLFMULTIPLES = $(UMAKE_ROOT)/bin/glfMultiples --minMapQuality 0 --minDepth 1 --maxDepth 10000000 --uniformTsTv --smartFilter # glfMultiples and options

+

GLFEXTRACT = $(UMAKE_ROOT)/bin/glfExtract # glfExtract for obtaining VCF for known sites

+

VCFPILEUP = $(UMAKE_ROOT)/bin/vcfPileup # vcfPileup to generate rich per-site information

+

INFOCOLLECTOR = $(UMAKE_ROOT)/bin/infoCollector # create filtering statistics

+

VCFMERGE = perl $(UMAKE_ROOT)/scripts/bams2vcfMerge.pl # merge multiple BAMs separated by chunk of genomes

+

VCFCOOKER = $(UMAKE_ROOT)/bin/vcfCooker # vcfCooker for filtering

+

VCFSUMMARY = perl $(UMAKE_ROOT)/scripts/vcfSummary.pl # Get summary statistics of discovered site

+

VCFSPLIT = perl $(UMAKE_ROOT)/scripts/vcfSplit.pl # split VCF into overlapping chunks for genotype refinement

+

VCFPASTE = perl $(UMAKE_ROOT)/scripts/vcfPaste.pl # vcfPaste to generate filtered genotype VCF

+

BEAGLE = java -Xmx4g -jar $(UMAKE_ROOT)/ext/beagle.20101226.jar seed=993478 gprobs=true niterations=50 lowmem=true # BEAGLE BINARY : NEED TO COPY BEAGLE TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

VCF2BEAGLE = perl $(UMAKE_ROOT)/scripts/vcf2Beagle.pl --PL # convert VCF (with PL tag) into beagle input

+

BEAGLE2VCF = perl $(UMAKE_ROOT)/scripts/beagle2Vcf.pl # convert beagle output to VCF

+

THUNDER = $(UMAKE_ROOT)/bin/thunderVCF -r 30 --phase --dosage --compact --inputPhased # MaCH/Thunder genotype refinement step

+

LIGATEVCF = perl $(UMAKE_ROOT)/scripts/ligateVcf.pl # ligate multiple phased VCFs while resolving the phase between VCFs

+

BGZIP = $(UMAKE_ROOT)/ext/bgzip # NEED TO COPY BGZIP TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

TABIX = $(UMAKE_ROOT)/ext/tabix # NEED TO COPY TABIX TO $(UMAKE_ROOT)/ext DIRECTORY BEFORE RUNNING PIPELINE

+

#

+

###############################################################################

+

## ARGUMENT FOR FILTERING

+

###############################################################################

+

SAMTOOLS_VIEW_FILTER = -q 20 -F 0x0704 # samtools view filter (-q by MQ, -F by flag)

+

FILTER_MAX_SAMPLE_DP = 20 # Max Depth per Sample (20x default) -- will generate FILTER_MAX_TOTAL_DP automatically

+

FILTER_MIN_SAMPLE_DP = 0.5 # Min Depth per Sample (0.5x default) -- will generate FILTER_MIN_TOTAL_DP automatically

+

FILTER_ARGS = --write-vcf --filter --maxDP $(FILTER_MAX_TOTAL_DP) --minDP $(FILTER_MIN_TOTAL_DP) --maxAB 70 --maxSTR 20 --minSTR -20 --winIndel 5 --maxSTZ 5 --minSTZ -5 --maxAOI 5 # arguments for filtering (refer to vcfCooker for details)

+

#

+

#############################################################################

+

## RELATIVE DIRECTORY UNDER OUT_DIR

+

#############################################################################

+

BAM_GLF_DIR = glfs/bams # BAM level GLF

+

SM_GLF_DIR = glfs/samples # sample level GLF (after glfMerge if necessary)

+

VCF_DIR = vcfs # unfiltered and filtered VCF

+

PVCF_DIR = pvcfs # vcfPileup results

+

SPLIT_DIR = split # chunks split to multiple overlapping pieces

+

BEAGLE_DIR = beagle # beagle output

+

THUNDER_DIR = thunder # MaCH/thunder output

+

GLF_INDEX = glfIndex.ped # glfMultiples/glfExtract index file info

+

#

+

#############################################################################

+

## OTHER OPTIONS

+

#############################################################################

+

UNIT_CHUNK = 5000000 # Chunk size of SNP calling : 5Mb is default

+

LD_NSNPS = 10000 # Chunk size of genotype refinement : 10,000 SNPs

+

LD_OVERLAP = 1000 # Overlapping # of SNPs between chunks : 1,000 SNPs

+

RUN_INDEX_FORCE = FALSE # Regenerate BAM index file even if it exists

+

MERGE_BEFORE_FILTER = FALSE # Merge across the chromosome before filtering

+

NOBAQ_SUBSTRINGS = SOLID # Avoid BAQ if the BAM file contains the substring

+

ASSERT_BAM_EXIST = FALSE # Check if BAM file exists

+

#

+

#############################################################################

+

## CLUSTER SETTING : CURRENTLY COMPATIBLE WITH MOSIX PLATFORM

+

#############################################################################

+

MOS_PREFIX = # PREFIX FOR MOSIX COMMAND (BLANK IF UNUSED)

+

MOS_NODES = # COMMA-SEPARATED LIST OF NODES TO SUBMIT JOBS

+

REMOTE_PREFIX = # REMOTE_PREFIX : Set if cluster node see the directory differently (e.g. /net/mymachine/[original-dir])

+

== Software Components ==

Hmkang

Administrators

1,120

edits

Changes

UMAKE (view source)

Revision as of 00:35, 7 July 2011

Navigation menu

Page actions

Page actions

Personal tools

quick links

teaching

Navigation

Search

Tools