Sam Library Usage Examples

From Genome Analysis Wiki
Revision as of 11:58, 8 October 2010 by Mktrost (talk | contribs)
Jump to navigationJump to search

Usage Examples

This page contains examples on how to use the SAM/BAM Library.

Simple Read/Write SAM/BAM files

The following example reads in a sam/bam file and writes it out as a sam/bam file. The file format of the input sam/bam is determined by the SamFile class based on reading the type from the file. The file format of the output sam/bam file is determined by the SamFile class based on the extension of the output file. A ".bam" extension indicates a BAM file. All other extensions indicate SAM files.

#include "SamFile.h"
#include "SamValidation.h"

int main(int argc, char ** argv)
{
   // Check for the appropriate number of arguments.
   if(argc != 3)
   {
      printf("./bam <inputFile> <outputFile.sam/bam>\n");
      exit(-1);
   }

   // Open the input file for reading.
   SamFile samIn;
   samIn.OpenForRead(argv[1]);

   // Open the output file for writing.
   SamFile samOut;
   samOut.OpenForWrite(argv[2]);

   // Read the sam header.
   SamFileHeader samHeader;
   samIn.ReadHeader(samHeader);

   // Write the sam header.
   samOut.WriteHeader(samHeader);

   SamRecord samRecord;
   SamValidationErrors samInvalidErrors;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        // Successfully read a record from the file, so write it.
        samOut.WriteRecord(samHeader, samRecord);
    }

    std::cout << std::endl << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;
    std::cout << "Number of records written = " << 
        samOut.GetCurrentRecordCount() << std::endl;

    // Return success since a failure would have thrown
    // an exception.
    return(returnStatus);
 }


Read Only One Reference/Chromosome from Sorted/Indexed BAM Files

This example reads in the inputFilename bam file and writes it back out section by section to the specified outputFilename, starting with section -1. It also prints a count of the number of records in each section.

#include "SamFile.h"
#include "SamValidation.h"

int ReadIndexedBam(const char* inputFilename,
                   const char* outputFilename,
                   const char* indexFilename)
{
   // Open the input file for reading.
   SamFile samIn;
   samIn.OpenForRead(inputFilename);

   // Open the bam index file for reading.
   samIn.ReadBamIndex(indexFilename);

   // Open the output file for writing.
   SamFile samOut;
   samOut.OpenForWrite(outputFilename);

   // Read the sam header.
   SamFileHeader samHeader;
   samIn.ReadHeader(samHeader);

   // Write the sam header.
   samOut.WriteHeader(samHeader);

   SamRecord samRecord;
   SamValidationErrors samInvalidErrors;

   // Loop through each Reference.
   for(int i = -1; i < 23; i++)
   {
      int numSectionRecords = 0;
      samIn.SetReadSection(i);
      // Keep reading records until they aren't more.
      while(samIn.ReadRecord(samHeader, samRecord))
      {
         numSectionRecords++;
         // Successfully read a record from the file, so write it.
         samOut.WriteRecord(samHeader, samRecord);
      }

      std::cout << "Reference ID " << i << " has " << numSectionRecords 
                << " records" << std::endl;
   }
   
   std::cout << "Number of records = " << samIn.GetCurrentRecordCount() << std::endl;
   
   return(0);
}


Read a Specified Region from a Sorted & Indexed BAM File

This example is found in pipeline/bam/WriteRegion.cpp and is part of the bam executable.


#include "SamFile.h"
#include "Parameters.h"
#include "BgzfFileType.h"

int writeRegion(int argc, char **argv)
{
    // Extract command line arguments.
    static const int UNSPECIFIED_INT = -1;
    String inFile = "";
    String outFile = "";
    String indexFile = "";
    String refName = "";
    int refID = UNSPECIFIED_INT;
    int start = UNSPECIFIED_INT;
    int end = UNSPECIFIED_INT;
    bool noeof = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_INTPARAMETER("refID", &refID)
        LONG_INTPARAMETER("start", &start)
        LONG_INTPARAMETER("end", &end)
        LONG_PARAMETER("noeof", &noeof)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    inputParameters.Status();
   
    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        writeRegionUsage();
        // mandatory argument was not specified.
        std::cerr << "Missing mandatory argument: --in" << std::endl;
        return(-1);
    }
    if(outFile == "")
    {
        writeRegionUsage();
        // mandatory argument was not specified.
        std::cerr << "Missing mandatory argument: --out" << std::endl;
        return(-1);
    }
    
    if(indexFile == "")
    {
        // In file was not specified, so set it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }

    if(refID != -1 && refName.Length() != 0)
    {
        std::cerr << "Can't specify both refID and refName" << std::endl;
        return(-1);
    }

    SamFile samIn;
    // Open the file for reading.   
    samIn.OpenForRead(inFile);

    // If refName is set, use that.
    if(refName.Length() != 0)
    {
        // Use Reference Name.
        samIn.SetReadSection(refName.c_str(), start, end);
    }
    else
    {
        // Use Reference ID
        samIn.SetReadSection(refID, start, end);
    }

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);

    // Open the bam index file for reading.
    samIn.ReadBamIndex(indexFile);

    // Read & write the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);
    samOut.WriteHeader(samHeader);

    // Read the sam records.
    SamRecord samRecord;
    // Track the status.
    int numSectionRecords = 0;

    // Set returnStatus to success. 
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // Keep reading records until they aren't anymore.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        // Successfully read a record from the file, so write it.
        samOut.WriteRecord(samHeader, samRecord);
        ++numSectionRecords;
    }

    std::cout << "Wrote " << outFile << " with " << numSectionRecords << " records.\n";
    return(returnStatus);
}


Validate SAM/BAM Files Continuing Processing After Allowed Errors

This example demonstrates calling validation on a SAM/BAM file. It also demonstrates a more complicated read loop that continues until a configurable number of errors are hit.

#include "SamFile.h"
#include "SamValidation.h"

int Validate(String inFile, int quitAfterErrorNum, int maxReportedErrors, SamFile::SortedType sortType)
{
   // Check to see if the in file was specified.  If it wasn't this is an error.
   if(inFile == "")
   {
      // In file was not specified but it is mandatory.
      std::cerr << "--in is a mandatory argument for validate, "
                << "but was not specified" << std::endl;
      return(-1);
   }

   SamFile samIn(ErrorHandler::RETURN);
   // Open the file for reading.   
   if(!samIn.OpenForRead(inFile))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Set the sorting validation type.
   samIn.setSortedValidation(sortType);

   // Read the sam header.
   SamFileHeader samHeader;
   if(!samIn.ReadHeader(samHeader))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Read the sam records.
   SamRecord samRecord;

   // Track the status.
   SamStatus::Status status = SamStatus::SUCCESS;

   // Keep reading records until the end of the file is reached.
   int numValidRecords = 0;
   int numInvalidRecords = 0;
   int numErrorRecords = 0;
   int numRecords = 0;
   int numReportedErrors = 0;
   int totalErrorRecords = 0;

   SamValidationErrors invalidSamErrors;

   // Keep reading records from the file until SamFile::ReadRecord
   // indicates to stop (returns false).
   while( (samIn.ReadRecord(samHeader, samRecord))||
          (SamStatus::isContinuableStatus(samIn.GetStatus()) &&
           ( (quitAfterErrorNum < 0) ||
             (numErrorRecords < quitAfterErrorNum) )) )
   {
      ++numRecords;
      if(samIn.GetStatus() == SamStatus::SUCCESS)
      {
         // Successfully set the record, so check to see if it is valid.
         // Clear any errors in the list.
         invalidSamErrors.clear();
         if(!SamValidator::isValid(samHeader, samRecord, invalidSamErrors))
         {
            // The record is not valid.
            ++numInvalidRecords;
            ++totalErrorRecords;
            if(numReportedErrors < maxReportedErrors)
            {
               std::cout << "Record " << numRecords << std::endl
                         << invalidSamErrors << std::endl;
               numReportedErrors += invalidSamErrors.numErrors();
               status = SamStatus::INVALID;
            }
         }
         else
         {
            // Valid record, so increment the counter.
            ++numValidRecords;
         }
      }
      else
      {
         // Error reading the record.
         ++numErrorRecords;
         if(numReportedErrors < maxReportedErrors)
         {
            // report error.
            std::cout << "Record " << numRecords << std::endl
                      << samIn.GetStatusMessage() << std::endl
                      << std::endl;
            ++numReportedErrors;
            ++totalErrorRecords;
            status = samIn.GetStatus();
         }
      }
   }

   if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
   {
      ++numErrorRecords;
      // The last read call had a failure, so report it.
      if(numReportedErrors < maxReportedErrors)
      {
         std::cout << "Record " << numRecords << ": ";
         std::cout << std::endl << samIn.GetStatusMessage() << std::endl;
      }
      status = samIn.GetStatus();
   }

   printf("\nNumber of records read = %d\n", numRecords);
   printf("Number of valid records = %d\n", numValidRecords);
   printf("Returning: %d (%s)\n", status, SamStatus::getStatusString(status));
   return(status);
}