Difference between revisions of "Sam Library Usage Examples"

From Genome Analysis Wiki
Jump to navigationJump to search
Line 90: Line 90:
  
  
== Simple Read/Write Indexed BAM Files ==
+
== Read Only One Reference/Chromosome from Sorted/Indexed BAM Files ==
 
This example reads in the inputFilename bam file and writes it back out section by section to the specified outputFilename, starting with section -1.  It also prints a count of the number of records in each section.
 
This example reads in the inputFilename bam file and writes it back out section by section to the specified outputFilename, starting with section -1.  It also prints a count of the number of records in each section.
 
<source lang="cpp">
 
<source lang="cpp">
Line 173: Line 173:
 
}
 
}
 
</source>
 
</source>
 +
 +
 +
== Read a Specified Region from a Sorted & Indexed BAM File ==
 +
This example is found in pipeline/bam/WriteRegion.cpp and is part of the bam executable.
 +
 +
 +
<source lang="cpp">
 +
#include "SamFile.h"
 +
#include "Parameters.h"
 +
#include "BgzfFileType.h"
 +
 +
int writeRegion(int argc, char **argv)
 +
{
 +
    // Extract command line arguments.
 +
    static const int UNSPECIFIED_INT = -1;
 +
    String inFile = "";
 +
    String outFile = "";
 +
    String indexFile = "";
 +
    String refName = "";
 +
    int refID = UNSPECIFIED_INT;
 +
    int start = UNSPECIFIED_INT;
 +
    int end = UNSPECIFIED_INT;
 +
    bool noeof = false;
 +
 +
    ParameterList inputParameters;
 +
    BEGIN_LONG_PARAMETERS(longParameterList)
 +
        LONG_STRINGPARAMETER("in", &inFile)
 +
        LONG_STRINGPARAMETER("out", &outFile)
 +
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
 +
        LONG_STRINGPARAMETER("refName", &refName)
 +
        LONG_INTPARAMETER("refID", &refID)
 +
        LONG_INTPARAMETER("start", &start)
 +
        LONG_INTPARAMETER("end", &end)
 +
        LONG_PARAMETER("noeof", &noeof)
 +
        END_LONG_PARAMETERS();
 +
 
 +
    inputParameters.Add(new LongParameters ("Input Parameters",
 +
                                            longParameterList));
 +
 +
    inputParameters.Read(argc-1, &(argv[1]));
 +
 +
    inputParameters.Status();
 +
 
 +
    // If no eof block is required for a bgzf file, set the bgzf file type to
 +
    // not look for it.
 +
    if(noeof)
 +
    {
 +
        // Set that the eof block is not required.
 +
        BgzfFileType::setRequireEofBlock(false);
 +
    }
 +
 +
    // Check to see if the in file was specified, if not, report an error.
 +
    if(inFile == "")
 +
    {
 +
        writeRegionUsage();
 +
        // mandatory argument was not specified.
 +
        std::cerr << "Missing mandatory argument: --in" << std::endl;
 +
        return(-1);
 +
    }
 +
    if(outFile == "")
 +
    {
 +
        writeRegionUsage();
 +
        // mandatory argument was not specified.
 +
        std::cerr << "Missing mandatory argument: --out" << std::endl;
 +
        return(-1);
 +
    }
 +
   
 +
    if(indexFile == "")
 +
    {
 +
        // In file was not specified, so set it to the in file
 +
        // + ".bai"
 +
        indexFile = inFile + ".bai";
 +
    }
 +
 +
    if(refID != -1 && refName.Length() != 0)
 +
    {
 +
        std::cerr << "Can't specify both refID and refName" << std::endl;
 +
        return(-1);
 +
    }
 +
 +
    SamFile samIn;
 +
    // Open the file for reading. 
 +
    if(!samIn.OpenForRead(inFile))
 +
    {
 +
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
 +
        return(samIn.GetStatus());
 +
    }
 +
 +
    // If refName is set, use that.
 +
    if(refName.Length() != 0)
 +
    {
 +
        // Use Reference Name.
 +
        if(!samIn.SetReadSection(refName.c_str(), start, end))
 +
        {
 +
            fprintf(stderr, "%s\n", samIn.GetStatusMessage());
 +
            return(samIn.GetStatus());
 +
        }
 +
    }
 +
    else
 +
    {
 +
        // Use Reference ID
 +
        if(!samIn.SetReadSection(refID, start, end))
 +
        {
 +
            fprintf(stderr, "%s\n", samIn.GetStatusMessage());
 +
            return(samIn.GetStatus());
 +
        }
 +
    }
 +
 +
    // Open the output file for writing.
 +
    SamFile samOut;
 +
    if(!samOut.OpenForWrite(outFile))
 +
    {
 +
        fprintf(stderr, "%s\n", samOut.GetStatusMessage());
 +
        return(samOut.GetStatus());
 +
    }
 +
 +
    // Open the bam index file for reading.
 +
    if(!samIn.ReadBamIndex(indexFile))
 +
    {
 +
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
 +
        return(samIn.GetStatus());
 +
    }
 +
 +
    // Read the sam header.
 +
    SamFileHeader samHeader;
 +
    if(!samIn.ReadHeader(samHeader))
 +
    {
 +
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
 +
        return(samIn.GetStatus());
 +
    }
 +
   
 +
    // Write the sam header.
 +
    if(!samOut.WriteHeader(samHeader))
 +
    {
 +
        fprintf(stderr, "%s\n", samOut.GetStatusMessage());
 +
        return(samOut.GetStatus());   
 +
    }
 +
 +
    // Read the sam records.
 +
    SamRecord samRecord;
 +
    // Track the status.
 +
    int numSectionRecords = 0;
 +
 +
    // Set returnStatus to success.  It will be changed
 +
    // to the failure reason if any of the writes fail.
 +
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
 +
 +
    // Keep reading records until they aren't anymore.
 +
    while(samIn.ReadRecord(samHeader, samRecord))
 +
    {
 +
        // Successfully read a record from the file, so write it.
 +
        if(!samOut.WriteRecord(samHeader, samRecord))
 +
        {
 +
            // Failed to write a record.
 +
            fprintf(stderr, "%s\n", samOut.GetStatusMessage());
 +
            returnStatus = samOut.GetStatus();
 +
        }
 +
        ++numSectionRecords;
 +
    }
 +
 +
    if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
 +
    {
 +
        // Failed to read a record.
 +
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
 +
        // Set the return status to the reason why
 +
        // the read failed.
 +
        returnStatus = samIn.GetStatus();
 +
    }
 +
 +
    std::cout << "Wrote " << outFile << " with " << numSectionRecords << " records.\n";
 +
    return(returnStatus);
 +
}
 +
</source>
 +
  
 
== Validate SAM/BAM Files Continuing Processing After Allowed Errors ==
 
== Validate SAM/BAM Files Continuing Processing After Allowed Errors ==

Revision as of 10:42, 2 August 2010

Usage Examples

This page contains examples on how to use the SAM/BAM Library.

Simple Read/Write SAM/BAM files

The following example reads in a sam/bam file and writes it out as a sam/bam file. The file format of the input sam/bam is determined by the SamFile class based on reading the type from the file. The file format of the output sam/bam file is determined by the SamFile class based on the extension of the output file. A ".bam" extension indicates a BAM file. All other extensions indicate SAM files.

#include "SamFile.h"
#include "SamValidation.h"

int main(int argc, char ** argv)
{
   // Check for the appropriate number of arguments.
   if(argc != 3)
   {
      printf("./bam <inputFile> <outputFile.sam/bam>\n");
      exit(-1);
   }

   // Open the input file for reading.
   SamFile samIn;
   if(!samIn.OpenForRead(argv[1]))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Open the output file for writing.
   SamFile samOut;
   if(!samOut.OpenForWrite(argv[2]))
   {
      fprintf(stderr, "%s\n", samOut.GetStatusMessage());
      return(samOut.GetStatus());
   }

   // Read the sam header.
   SamFileHeader samHeader;
   if(!samIn.ReadHeader(samHeader))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Write the sam header.
   if(!samOut.WriteHeader(samHeader))
   {
      fprintf(stderr, "%s\n", samOut.GetStatusMessage());
      return(samOut.GetStatus());     
   }

   SamRecord samRecord;
   SamValidationErrors samInvalidErrors;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        // Successfully read a record from the file, so write it.
        if(!samOut.WriteRecord(samHeader, samRecord))
        {
            // Failed to write a record.
            fprintf(stderr, "%s\n", samOut.GetStatusMessage());
            returnStatus = samOut.GetStatus();
        }
    }

    if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
    {
        // Failed to read a record.
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        // Set the return status to the reason why
        // the read failed.
        returnStatus = samIn.GetStatus();
    }

    std::cout << std::endl << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;
    std::cout << "Number of records written = " << 
        samOut.GetCurrentRecordCount() << std::endl;

    // Since the reads were successful, return the status based
    // on the status of the writes.  If any failed, return
    // their failure status.
    return(returnStatus);
 }


Read Only One Reference/Chromosome from Sorted/Indexed BAM Files

This example reads in the inputFilename bam file and writes it back out section by section to the specified outputFilename, starting with section -1. It also prints a count of the number of records in each section.

#include "SamFile.h"
#include "SamValidation.h"

int ReadIndexedBam(const char* inputFilename,
                   const char* outputFilename,
                   const char* indexFilename)
{
   // Open the input file for reading.
   SamFile samIn;
   if(!samIn.OpenForRead(inputFilename))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Open the bam index file for reading.
   if(!samIn.ReadBamIndex(indexFilename))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Open the output file for writing.
   SamFile samOut;
   if(!samOut.OpenForWrite(outputFilename))
   {
      fprintf(stderr, "%s\n", samOut.GetStatusMessage());
      return(samOut.GetStatus());
   }

   // Read the sam header.
   SamFileHeader samHeader;
   if(!samIn.ReadHeader(samHeader))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Write the sam header.
   if(!samOut.WriteHeader(samHeader))
   {
      fprintf(stderr, "%s\n", samOut.GetStatusMessage());
      return(samOut.GetStatus());     
   }

   SamRecord samRecord;
   SamValidationErrors samInvalidErrors;

   // Loop through each Reference.
   for(int i = -1; i < 23; i++)
   {
      int numSectionRecords = 0;
      samIn.SetReadSection(i);
      // Keep reading records until they aren't more.
      while(samIn.ReadRecord(samHeader, samRecord))
      {
         numSectionRecords++;
         // Successfully read a record from the file, so write it.
         if(!samOut.WriteRecord(samHeader, samRecord))
         {
            // Failed to write a record.
            fprintf(stderr, "%s\n", samOut.GetStatusMessage());
         }
      }

      if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
      {
         // Failed to read a record.
         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      }

      std::cout << "Reference ID " << i << " has " << numSectionRecords 
                << " records" << std::endl;
   }
   
   std::cout << "Number of records = " << samIn.GetCurrentRecordCount() << std::endl;
   
   return(0);
}


Read a Specified Region from a Sorted & Indexed BAM File

This example is found in pipeline/bam/WriteRegion.cpp and is part of the bam executable.


#include "SamFile.h"
#include "Parameters.h"
#include "BgzfFileType.h"

int writeRegion(int argc, char **argv)
{
    // Extract command line arguments.
    static const int UNSPECIFIED_INT = -1;
    String inFile = "";
    String outFile = "";
    String indexFile = "";
    String refName = "";
    int refID = UNSPECIFIED_INT;
    int start = UNSPECIFIED_INT;
    int end = UNSPECIFIED_INT;
    bool noeof = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_INTPARAMETER("refID", &refID)
        LONG_INTPARAMETER("start", &start)
        LONG_INTPARAMETER("end", &end)
        LONG_PARAMETER("noeof", &noeof)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    inputParameters.Status();
   
    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        writeRegionUsage();
        // mandatory argument was not specified.
        std::cerr << "Missing mandatory argument: --in" << std::endl;
        return(-1);
    }
    if(outFile == "")
    {
        writeRegionUsage();
        // mandatory argument was not specified.
        std::cerr << "Missing mandatory argument: --out" << std::endl;
        return(-1);
    }
    
    if(indexFile == "")
    {
        // In file was not specified, so set it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }

    if(refID != -1 && refName.Length() != 0)
    {
        std::cerr << "Can't specify both refID and refName" << std::endl;
        return(-1);
    }

    SamFile samIn;
    // Open the file for reading.   
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // If refName is set, use that.
    if(refName.Length() != 0)
    {
        // Use Reference Name.
        if(!samIn.SetReadSection(refName.c_str(), start, end))
        {
            fprintf(stderr, "%s\n", samIn.GetStatusMessage());
            return(samIn.GetStatus());
        }
    }
    else
    {
        // Use Reference ID
        if(!samIn.SetReadSection(refID, start, end))
        {
            fprintf(stderr, "%s\n", samIn.GetStatusMessage());
            return(samIn.GetStatus());
         }
    }

    // Open the output file for writing.
    SamFile samOut;
    if(!samOut.OpenForWrite(outFile))
    {
        fprintf(stderr, "%s\n", samOut.GetStatusMessage());
        return(samOut.GetStatus());
    }

    // Open the bam index file for reading.
    if(!samIn.ReadBamIndex(indexFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }
    
    // Write the sam header.
    if(!samOut.WriteHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samOut.GetStatusMessage());
        return(samOut.GetStatus());     
    }

    // Read the sam records.
    SamRecord samRecord;
    // Track the status.
    int numSectionRecords = 0;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // Keep reading records until they aren't anymore.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        // Successfully read a record from the file, so write it.
        if(!samOut.WriteRecord(samHeader, samRecord))
        {
            // Failed to write a record.
            fprintf(stderr, "%s\n", samOut.GetStatusMessage());
            returnStatus = samOut.GetStatus();
        }
        ++numSectionRecords;
    }

    if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
    {
        // Failed to read a record.
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        // Set the return status to the reason why
        // the read failed.
        returnStatus = samIn.GetStatus();
    }

    std::cout << "Wrote " << outFile << " with " << numSectionRecords << " records.\n";
    return(returnStatus);
}


Validate SAM/BAM Files Continuing Processing After Allowed Errors

This example demonstrates calling validation on a SAM/BAM file. It also demonstrates a more complicated read loop that continues until a configurable number of errors are hit.

#include "SamFile.h"
#include "SamValidation.h"

int Validate(String inFile, int quitAfterErrorNum, int maxReportedErrors, SamFile::SortedType sortType)
{
   // Check to see if the in file was specified.  If it wasn't this is an error.
   if(inFile == "")
   {
      // In file was not specified but it is mandatory.
      std::cerr << "--in is a mandatory argument for validate, "
                << "but was not specified" << std::endl;
      return(-1);
   }

   SamFile samIn;
   // Open the file for reading.   
   if(!samIn.OpenForRead(inFile))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Set the sorting validation type.
   samIn.setSortedValidation(sortType);

   // Read the sam header.
   SamFileHeader samHeader;
   if(!samIn.ReadHeader(samHeader))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Read the sam records.
   SamRecord samRecord;

   // Track the status.
   SamStatus::Status status = SamStatus::SUCCESS;

   // Keep reading records until the end of the file is reached.
   int numValidRecords = 0;
   int numInvalidRecords = 0;
   int numErrorRecords = 0;
   int numRecords = 0;
   int numReportedErrors = 0;
   int totalErrorRecords = 0;

   SamValidationErrors invalidSamErrors;

   // Keep reading records from the file until SamFile::ReadRecord
   // indicates to stop (returns false).
   while( (samIn.ReadRecord(samHeader, samRecord))||
          (SamStatus::isContinuableStatus(samIn.GetStatus()) &&
           ( (quitAfterErrorNum < 0) ||
             (numErrorRecords < quitAfterErrorNum) )) )
   {
      ++numRecords;
      if(samIn.GetStatus() == SamStatus::SUCCESS)
      {
         // Successfully set the record, so check to see if it is valid.
         // Clear any errors in the list.
         invalidSamErrors.clear();
         if(!SamValidator::isValid(samHeader, samRecord, invalidSamErrors))
         {
            // The record is not valid.
            ++numInvalidRecords;
            ++totalErrorRecords;
            if(numReportedErrors < maxReportedErrors)
            {
               std::cout << "Record " << numRecords << std::endl
                         << invalidSamErrors << std::endl;
               numReportedErrors += invalidSamErrors.numErrors();
               status = SamStatus::INVALID;
            }
         }
         else
         {
            // Valid record, so increment the counter.
            ++numValidRecords;
         }
      }
      else
      {
         // Error reading the record.
         ++numErrorRecords;
         if(numReportedErrors < maxReportedErrors)
         {
            // report error.
            std::cout << "Record " << numRecords << std::endl
                      << samIn.GetStatusMessage() << std::endl
                      << std::endl;
            ++numReportedErrors;
            ++totalErrorRecords;
            status = samIn.GetStatus();
         }
      }
   }

   if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
   {
      ++numErrorRecords;
      // The last read call had a failure, so report it.
      if(numReportedErrors < maxReportedErrors)
      {
         std::cout << "Record " << numRecords << ": ";
         std::cout << std::endl << samIn.GetStatusMessage() << std::endl;
      }
      status = samIn.GetStatus();
   }

   printf("\nNumber of records read = %d\n", numRecords);
   printf("Number of valid records = %d\n", numValidRecords);
   printf("Returning: %d (%s)\n", status, SamStatus::getStatusString(status));
   return(status);
}