|
|
(62 intermediate revisions by one other user not shown) |
Line 1: |
Line 1: |
| + | [[Category:C++]] |
| + | [[Category:libStatGen]] |
| + | [[Category:libStatGen BAM]] |
| + | |
| = SAM/BAM File= | | = SAM/BAM File= |
| | | |
− | == Reading/Writing SAM/BAM Files ==
| + | See the github history: https://github.com/statgen/libStatGen/commits/master/bam for a list of the most recent updates to the BAM classes. |
− | The SamFile class allows a user to easily read/write a SAM/BAM file.
| |
− | The methods found in this class are:
| |
− | {| class="wikitable" style="width:100%" border="1"
| |
− | |+ style="font-size:150%"|'''SamFile Class Methods'''
| |
− | ! width=""|Method Name
| |
− | ! width=""|Description
| |
− | |-
| |
− | | bool OpenForRead(const char* filename)
| |
− | | Opens the specified file for reading.
| |
− | Determines if it is a BAM/SAM file by reading the beginning of the file.
| |
− | Returns true if successfully opened for reading, false if not.
| |
− | |-
| |
− | | bool OpenForWrite(const char * filename)
| |
− | Opens the specified file for writing.
| |
− | Opens as BAM file if the specified filename ends in .bam. Otherwise it is opened as a SAM file.
| |
− | Returns true if successfully opened for writing, false if not.
| |
− | |-
| |
− | | bool ReadHeader(SamFileHeader& header)
| |
− | | Reads the header section from the file and stores it in the passed in header.
| |
− | Returns true if successfully read, false if not.
| |
− | |-
| |
− | | bool WriteHeader(const SamFileHeader& header)
| |
− | | Writes the specified header into the file.
| |
− | Returns true if successfully written, false if not.
| |
− | |-
| |
− | | bool ReadRecord(SamFileHeader& header, SamRecord& record)
| |
− | | Reads the next record from the file and stores it in the passed in record.
| |
− | Returns true if successfully read, false if not.
| |
− | |-
| |
− | | bool WriteRecord(SamFileHeader& header, SamRecord& record)
| |
− | | Writes the specified record into the file.
| |
− | Returns true if successfully written, false if not.
| |
− | |}
| |
− | | |
− | === Usage Example ===
| |
− | The following example reads in a sam/bam file and writes it out as a sam/bam file. The file format of the input sam/bam is determined by the SamFile class based on reading the type from the file. The file format of the output sam/bam file is determined by the SamFile class based on the extension of the output file. A ".bam" extension indicates a BAM file. All other extensions indicate SAM files.
| |
− | <pre>
| |
− | int main(int argc, char ** argv)
| |
− | {
| |
− | if(argc != 3)
| |
− | {
| |
− | printf("./bam <inputFile> <outputFile.sam/bam>\n");
| |
− | exit(-1);
| |
− | }
| |
− | | |
− | | |
− | SamFile samIn;
| |
− |
| |
− | samIn.OpenForRead(argv[1]);
| |
− | | |
− | SamFile samOut;
| |
− | | |
− | samOut.OpenForWrite(argv[2]);
| |
− | | |
− | // Read the sam header.
| |
− | SamFileHeader samHeader;
| |
− | samIn.ReadHeader(samHeader);
| |
| | | |
− | samOut.WriteHeader(samHeader);
| + | [[BAM Review Action Items|Old BAM Review Action Items]] |
| | | |
− | // Read the first sam record.
| + | == Read & Write BAM/SAM Library Software == |
− | SamRecord samRecord;
| |
| | | |
− | // Keep reading records until it fails.
| + | The software reads the beginning of each file opened for reading to determine whether it is SAM or BAM. To determine the format (SAM/BAM) of a file opened for writing, the software checks the output file's extension. If the extension is "bam" it writes a BAM file, otherwise it writes a SAM file. |
− | int recordCount = 0;
| |
− | while (samIn.ReadRecord(samHeader, samRecord) == true)
| |
− | {
| |
− | recordCount++;
| |
− | samOut.WriteRecord(samHeader, samRecord);
| |
− | }
| |
− | printf("RecordCount = %d\n", recordCount);
| |
− | }
| |
− | </pre>
| |
| | | |
| + | The library is found in statgen/lib/bam. |
| | | |
− | == Setting fields in a SAM/BAM Header == | + | === BAM/SAM Classes === |
− | The SamFileHeader class contains accessors to set the header lines of a SAM/BAM header. Once the header has been set up with these set methods, the header lines can be pulled back out using the get accessors, or the header can later be written to a SAM/BAM file.
| + | {| style="margin: 1em 1em 1em 0; background-color: #f9f9f9; border: 1px #aaa solid; border-collapse: collapse;" border="1" |
− | The methods found in the SamFileHeader class for setting fields are:
| + | |-style="background: #f2f2f2; text-align: center;" |
− | {| class="wikitable" style="width:100%" border="1" | + | ! Class Name !! Description |
− | |+ style="font-size:150%"|'''SamFileHeader Class Methods''' | |
− | ! width=""|Method Name | |
− | ! width=""|Description | |
− | |-
| |
− | | bool addHeaderLine(const char* type, const char* tag, int value)
| |
− | | Adds the type, tag, and integer value to the header.
| |
− | Returns true if successfully added, false if not.
| |
| |- | | |- |
− | | bool addHeaderLine(const char* type, const char* tag, const char* value) | + | | <code>[[C++ Class: SamFile|SamFile]]</code> |
− | | Adds the type, tag, and const char* value to the header. | + | | Class used for reading/writing SAM/BAM files and their headers and records. |
− | Returns true if successfully added, false if not.
| |
| |- | | |- |
− | | bool addHeaderLine(const char* headerLine) | + | | <code>[[C++ Class: SamFileHeader|SamFileHeader]]</code> |
− | | Adds the already setup/formatted headerLine to the header. It is assumed that the line does not contain a “\n”. | + | | Class used for storing the header. Allows access for getting and setting header values when both reading & writing SAM/BAM files. |
− | Returns true if successfully added, false if not.
| |
− | |- | |
− | |}
| |
− | | |
− | | |
− | == Setting fields in a SAM/BAM Record ==
| |
− | The SamRecord class contains accessors to set the fields of a SAM/BAM record. They are used for creating a record that is not read from a SAM/BAM file. Once the record has been set up with these set methods, the fields can be pulled back out using the get accessors, or the record can later be written as either a SAM or a BAM record.
| |
− | The methods found in the SamRecord class for setting fields are:
| |
− | {| class="wikitable" style="width:100%" border="1"
| |
− | |+ style="font-size:150%"|'''SamRecord Class Set Methods'''
| |
− | ! width=""|Method Name
| |
− | ! width=""|Description
| |
| |- | | |- |
− | | bool setReadName(const char* readName) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamHeaderRecord.html SamHeaderRecord]</code> |
− | | Sets QNAME to the passed in name. | + | | Class used for storing the tag/value pairs within a given header line. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool setFlag(int flag) | + | | <code>[[C++ Class: SamRecord|SamRecord]]</code> |
− | | Sets the bitwise FLAG to the passed in value. | + | | Class used for storing a SAM/BAM Record. Allows access for getting and setting record values when both reading & writing SAM/BAM files. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool setReferenceID(int referenceID) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamStatus.html SamStatus]</code> |
− | | Sets the reference sequence id. The reference name is not currently stored. A map to the header needs to be done to get this (which is done when writing a SAM file). THIS is an opportunity for improvement.
| + | | Status value used by the BAM classes for returning the status of the operations. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool set1BasedPosition(int position) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamValidator.html SamValidator]</code> |
− | | Sets the leftmost position. The value passed in is 1-based (SAM formatted). Internal processing handles switching between SAM/BAM formats when read/written.
| + | | Validates a SAM/BAM Record. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool set0BasedPosition(int position) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamValidationError.html SamValidationError]</code> |
− | | Sets the leftmost position. The value passed in is 0-based (BAM formatted). Internal processing handles switching between SAM/BAM formats when read/written.
| + | | Validation Error Information for a SamRecord. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | |bool setMapQuality(int mapQuality) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamValidationErrors.html SamValidationErrors]</code> |
− | | Sets the mapping quality. | + | | Container for ValidationErrors. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool setCigar(const char* cigar) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classPileup.html Pileup]</code> |
− | | Sets the cigar string to the passed in CIGAR. This is a SAM formatted CIGAR string. Internal processing handles switching between SAM/BAM formats when read/written.
| + | | Template for doing pileups. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool setMateReferenceID(int mateReferenceID) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classPileupElement.html PileupElement]</code> |
− | | Sets the mate reference sequence id. The mate reference name is not currently stored. A map to the header needs to be done to get this (which is done when writing a SAM file). THIS is an opportunity for improvement. | + | | Base class that can be used for the elements stored in a Pileup. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool set1BasedMatePosition(int matePosition) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classErrorHandler.html ErrorHandler]</code> |
− | | Sets the leftmost mate position. The value passed in is 1-based (SAM formatted). Internal processing handles switching between SAM/BAM formats when read/written.
| + | | Class for handling errors based on the error handling type. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool set0BasedMatePosition(int matePosition) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classPosList.html PosList]</code> |
− | | Sets the leftmost mate position. The value passed in is 0-based (BAM formatted). Internal processing handles switching between SAM/BAM formats when read/written.
| + | Stores refID/position, but does not store values < 0. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool setInsertSize(int insertSize) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamFilter.html SamFilter]</code> |
− | | Sets the inferred insert size. | + | | Class for filtering a SAM/BAM record. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool setSequence(const char* seq) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamFlag.html SamFlag]</code> |
− | | Sets the sequence string to the passed in string. This is a SAM formatted sequence string. Internal processing handles switching between SAM/BAM formats when read/written. | + | | Class for getting information from a SAM/BAM flag. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool setQuality(const char* quality) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamReferenceInfo.html SamReferenceInfo]</code> |
− | | Sets the quality string to the passed in string. This is a SAM formatted quality string. Internal processing handles switching between SAM/BAM formats when read/written.
| + | | Class for tracking the reference information mapping between the reference ids and the reference names. |
− | Returns true if successfully set, false if not.
| |
| |- | | |- |
− | | bool addTag(const char* tag, char vtype, const char* value) | + | | <code>[http://csg.sph.umich.edu//mktrost/doxygen/current/classSamTags.html SamTags]</code> |
− | | Adds a tag to the record with the specified tag, vtype, and value. Vtype can be SAM/BAM vtype. Internal processing handles switching between SAM/BAM vtypes when read/written. | + | | Class for parsing/creating/operating on SAM/BAM record tags. |
− | Returns true if successfully set, false if not.
| |
| |} | | |} |
| | | |
| + | == FAQs == |
| + | [[SAM/BAM Classes FAQs]] |
| | | |
− | == Retrieving fields from a SAM/BAM Record == | + | == Usage Examples == |
− | The SamRecord class contains accessors to access the fields of a SAM/BAM record. They assume that the class has already been populated, either by using the set commands or by calling SamFile::ReadRecord. Not all of the values that can be retrieved using these get accessors have set methods. That is because they are internally calculated values if they were not read from a file.
| + | [[Sam Library Usage Examples]] |
− | | |
− | The methods found in the SamRecord class for retrieving fields are:
| |
− | {| class="wikitable" style="width:100%" border="1"
| |
− | |+ style="font-size:150%"|'''SamRecord Class Get Methods'''
| |
− | ! width=""|Method Name
| |
− | ! width=""|Description
| |
− | |-
| |
− | | int getBlockSize()
| |
− | | Returns the BAM block size of the record.
| |
− | |-
| |
− | | int getReferenceID()
| |
− | | Returns the reference sequence id (BAM format).
| |
− | |-
| |
− | | int get1BasedPosition()
| |
− | | Returns the 1-based (SAM formatted) leftmost position.
| |
− | |-
| |
− | | int get0BasedPosition()
| |
− | | Returns the 0-based (BAM formatted) leftmost position.
| |
− | |-
| |
− | | int getReadNameLength()
| |
− | | Returns the length of the ReadName (QNAME).
| |
− | |-
| |
− | | int getMapQuality()
| |
− | | Returns the map quality.
| |
− | |-
| |
− | | int getBin()
| |
− | | Returns the BAM bin for the record.
| |
− | |-
| |
− | | int getCigarLength()
| |
− | | Returns the length of the CIGAR in BAM format.
| |
− | |-
| |
− | | int getFlag()
| |
− | | Returns the flag.
| |
− | |-
| |
− | | int getReadLength()
| |
− | | Returns the length of the read.
| |
− | |-
| |
− | | int getMateReferenceID()
| |
− | | Returns the mate reference sequence id (BAM format).
| |
− | |-
| |
− | | int get1BasedMatePosition()
| |
− | | Returns the 1-based (SAM formatted) mate leftmost position.
| |
− | |-
| |
− | | int get0BasedMatePosition()
| |
− | | Returns the 0-based (BAM formatted) mate leftmost position.
| |
− | |-
| |
− | | int getInsertSize()
| |
− | | Returns the insert size.
| |
− | |-
| |
− | | const char* getReadName()
| |
− | | Returns the SAM formatted Read Name (QNAME).
| |
− | |-
| |
− | | const char* getCigar()
| |
− | | Returns the SAM formatted CIGAR string.
| |
− | |-
| |
− | | const char* getSequence()
| |
− | | Returns the SAM formatted Sequence string.
| |
− | |-
| |
− | | const char* getQuality()
| |
− | | Returns the SAM formatted Quality string.
| |
− | |-
| |
− | | bool getNextSamTag(char* tag, char& vtype, void** value)
| |
− | | Returns true if a tag was read, false if there are no more tags.
| |
− | For a true return value, tag is set to the name of the tag, vtype is set to the vtype of the tag, and value is a pointer to the value of the tag. You will then need to check vtype to cast value to int, double, char, or String.
| |
− | |-
| |
− | | bool isIntegerType(char vtype)
| |
− | | Returns true if the passed in vtype is of integer ('c', 'C', 's', 'S', 'i', 'I') type.
| |
− | |-
| |
− | | bool isDoubleType(char vtype)
| |
− | | Returns true if the passed in vtype is of double ('f') type.
| |
− | |-
| |
− | | bool isCharType(char vtype)
| |
− | | Returns true if the passed in vtype is of char ('A') type.
| |
− | |-
| |
− | | bool isStringType(char vtype)
| |
− | | Returns true if the passed in vtype is of String ('Z') type.
| |
− | |-
| |
− | |-
| |
− | |}
| |
− | | |
| | | |
− | Example of using getNextSamTag:
| |
− | <pre>
| |
− | // record is a previously setup SamRecord.
| |
− | String recordString = "";
| |
− | char tag[3];
| |
− | char vtype;
| |
− | void* value;
| |
| | | |
− | // While there are more tags, write them to the recordString.
| + | == Programs == |
− | while(record.getNextSamTag(tag, vtype, &value) != false)
| |
− | {
| |
− | recordString += "\t";
| |
− | recordString += tag;
| |
− | recordString += ":";
| |
− | recordString += vtype;
| |
− | recordString += ":";
| |
− | if(record.isIntegerType(vtype))
| |
− | {
| |
− | recordString += (int)*(int*)value;
| |
− | }
| |
− | else if(record.isDoubleType(vtype))
| |
− | {
| |
− | recordString += (double)*(double*)value;
| |
− | }
| |
− | else if(record.isCharType(vtype))
| |
− | {
| |
− | recordString += (char)*(char*)value;
| |
− | }
| |
− | else
| |
− | {
| |
− | // String type.
| |
− | recordString += (String)*(String*)value;
| |
− | }
| |
− | }
| |
| | | |
− | recordString += "\n";
| + | BamUtil contains a set of programs that uses this library to operate on SAM & BAM files. It includes tools for converting between SAM & BAM and validating the files. See [[BamUtil]] for more information and a description of all the tools. |
− | </pre>
| |