Changes

From Genome Analysis Wiki
Jump to navigationJump to search
3,422 bytes added ,  11:58, 12 June 2015
Line 1: Line 1:  
= Introduction =
 
= Introduction =
   −
GNU Makefile is a widely used tool for managing the complicated process of compiling a C program.
+
GNU Make is often thought of as a tool for managing the compilation of large C programs. This is true, but its potential is not limited to this!
But this is not the only use for this very powerful tool.  
     −
A statistical analysis usually involves multiple data preparation steps just to mould the input into
+
At its core, it is a generic pipelining framework that is aware of dependencies and can run steps in parallel.
a form that is acceptable by the analysis tool.  Analysis steps involving large data sets requires
  −
parallelization and this means partitioning the data into subsets that may be run independently on the
  −
cluster.  Upon completion of the analyses, the partial outputs have to be merged into a file again before
  −
plots are made to summarize the results. This is further compounded when one is interested in the effects
  −
of multiple parameter settings in an analysis.
     −
All these often result in hundreds of separate commands invoked and storing all these commands in
+
Statistical genetics analyses often require many steps: preparing the data, running computationally expensive analyses, and then collating the results.
a text file is probably not the efficient way.
+
 
 +
Make can potentially save you lots of time and hair pulling especially when your supervisor asks for ALL the analyses again but this time only with rare variants.
 +
 
 +
= Example =
 +
 
 +
This example does the following:
 +
 
 +
#generate 100 log files, each with a number written to it
 +
#concatenate the 100 log files into one file
 +
#delete the 100 log files
 +
 
 +
The example files may be found in /net/fantasia/home/atks/makefile_tutorial
 +
 
 +
  #generate make file using perl script
 +
  ./generate_simple_stuff
 +
 
 +
  #run make file sequentially
 +
  make -f simple_stuff.mk
 +
 
 +
  #run make file in parallel with at most 100 jobs
 +
  make -f simple_stuff.mk -j 100
 +
 
 +
  #clear files from run
 +
  make -f simple_stuff.mk clean
 +
 
 +
= Script =
 +
 
 +
<source lang=perl>
 +
#!/usr/bin/perl -w
 +
 
 +
use warnings;
 +
use strict;
 +
use POSIX;
 +
use Getopt::Long;
 +
use File::Path;
 +
use File::Basename;
 +
use Pod::Usage;
 +
 
 +
=head1 NAME
 +
 
 +
generate_simple_stuff_makefile
 +
 
 +
=head1 SYNOPSIS
 +
 
 +
generate_simple_stuff_makefile [options] 
 +
 
 +
  -o    output directory : location of all output files
 +
  -m    output make file
 +
 
 +
example: ./generate_simple_stuff_makefile.pl
 +
 
 +
=head1 DESCRIPTION
 +
 +
=cut
 +
 
 +
#option variables
 +
my $help;
 +
my $verbose;
 +
my $debug;
 +
my $outputDir = getcwd();
 +
my $makeFile = "simple_stuff.mk";
 +
 
 +
#initialize options
 +
Getopt::Long::Configure ('bundling');
 +
 
 +
#Parse the command line.
#GetOptions() returns true when every switch it sees is valid, so a lone -h
#is a *successful* parse.  $help therefore has to be checked on its own and
#not only inside the failure branch, otherwise -h silently falls through and
#the script goes on to generate the makefile.
my $optionsOK = GetOptions ('h'=>\$help, 'v'=>\$verbose, 'd'=>\$debug,
                            'o:s'=>\$outputDir,
                            'm:s'=>\$makeFile);

if ($help)
{
    #-h given (with or without other errors): show the full manual
    pod2usage(-verbose => 2);
}
elsif (!$optionsOK || !defined($outputDir) || scalar(@ARGV)!=0)
{
    #unknown switch, missing output directory or stray positional argument:
    #print the usage message and exit with status 1
    pod2usage(1);
}
 +
 
 +
##############
 +
#print options
 +
##############
 +
#echo the effective settings, framed by blank lines
print "Options\n",
      "\n",
      "output directory : $outputDir\n",
      "\n";
 +
 
 +
my @nodes = ();
 +
for my $i (140..171)
 +
{
 +
    push(@nodes, "$i");
 +
}
 +
my $nodes = join(",", @nodes);
 +
 
 +
#arrays for storing targets, dependencies and commands
 +
my @tgts = ();
 +
my @deps = ();
 +
my @cmds = ();
 +
 
 +
#temporary variables
 +
my $tgt;
 +
my $dep;
 +
my @cmd;
 +
 
 +
mkpath($outputDir);
 +
 
 +
my $inputFiles = "";
 +
my $inputFilesOK = "";
 +
my $inputFile = "";
 +
my $outputFile = "";
 +
 
 +
######################
 +
#1. Generate 100 files
 +
######################
 +
#one make step per number: write the number into <n>.log and mark the step
#as done with <n>.OK
for my $n (1..100)
{
    my $logFile = "$outputDir/$n.log";
    my $okFile = "$outputDir/$n.OK";

    #space-separated lists consumed by the concatenation and cleanup steps
    $inputFiles .= " $logFile";
    $inputFilesOK .= " $okFile";

    $tgt = $okFile;
    $dep = "";
    @cmd = ("echo $n > $logFile");
    #makeLocalStep($tgt, $dep, @cmd);
    makeSlurm($tgt, $dep, @cmd);
}
 +
 
 +
#########################
 +
#2. Concatenate 100 files
 +
#########################
 +
#merge every per-number log into all.log; the step's make target is the
#marker file all.log.OK and it waits on all of the per-number .OK markers
$outputFile = $outputDir . "/all.log";
$tgt = $outputFile . ".OK";
$dep = $inputFilesOK;
@cmd = ("cat " . $inputFiles . " > " . $outputFile);
#makeLocalStep($tgt, $dep, @cmd);
makeSlurm($tgt, $dep, @cmd);
 +
 +
###########################
 +
#3. Cleanup temporary files
 +
###########################
 +
#The cleanup must wait for the concatenation step to finish.  That step's
#make target is the marker file all.log.OK; all.log itself is only produced
#as a side effect of its recipe and has no rule of its own.  Depending on
#all.log (as before) gives make no ordering between the two steps, so under
#"make -j" the log files could be removed while cat is still reading them, or
#make could stop because it has no rule to build all.log.
$tgt = "$outputDir/cleaned.OK";
$dep = "$outputDir/all.log.OK";
@cmd = ("rm $inputFiles");
#makeLocalStep($tgt, $dep, @cmd);
makeSlurm($tgt, $dep, @cmd);
 +
 +
#*******************
 +
#Write out make file
 +
#*******************
 +
#three-argument open on a lexical handle; include the OS error in the message
open(my $mak, ">", $makeFile) || die "Cannot open $makeFile: $!\n";
print $mak ".DELETE_ON_ERROR:\n\n";
#all and clean name actions rather than files, so a stray file called "all"
#or "clean" must not make them look up to date
print $mak ".PHONY: all clean\n\n";
#written before "clean" is appended below, so that the default goal builds
#every step but never triggers the clean rule
print $mak "all: @tgts\n\n";

#clean
push(@tgts, "clean");
push(@deps, "");
push(@cmds, "\t-rm -rf $outputDir/*.OK $outputDir/*.log");

#one rule per registered step: "target: dependencies" followed by its recipe
for my $i (0 .. $#tgts)
{
    print $mak "$tgts[$i]: $deps[$i]\n";
    print $mak "$cmds[$i]\n";
}
#buffered write errors only surface at close, so check it
close($mak) || die "Cannot write $makeFile: $!\n";
 +
 
 +
##########
 +
#functions
 +
##########
 +
 
 +
#run slurm jobs
 +
#Register a make step whose commands are each launched through srun.
#  $target       : file that make treats as the step's target
#  $dependencies : prerequisite string written after "target:"
#  @commands     : shell commands, one recipe line each
#Appends to the file-level @tgts, @deps and @cmds arrays.
sub makeSlurm
{
    my ($target, $dependencies, @commands) = @_;

    push(@tgts, $target);
    push(@deps, $dependencies);

    #every command becomes a tab-indented "srun ..." line; the final line
    #touches the target so that make can see the step has completed
    my $recipe = join("", map { "\tsrun " . $_ . "\n" } @commands)
               . "\ttouch $target\n";
    push(@cmds, $recipe);
}
 +
 
 +
#run a local job
 +
#Register a make step whose commands run directly in the recipe (no srun).
#  $target       : file that make treats as the step's target
#  $dependencies : prerequisite string written after "target:"
#  @commands     : shell commands, one recipe line each
#Appends to the file-level @tgts, @deps and @cmds arrays.
sub makeLocalStep
{
    my ($target, $dependencies, @commands) = @_;

    push(@tgts, $target);
    push(@deps, $dependencies);

    #tab-indent each command, then add the line that touches the target so
    #that make can see the step has completed
    my @recipeLines = map { "\t" . $_ . "\n" } @commands;
    push(@recipeLines, "\ttouch $target\n");
    push(@cmds, join("", @recipeLines));
}
 +
</source>
    
= Solution =
 
= Solution =
1,102

edits

Navigation menu