00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_FILE_HEADER_H__ 00019 #define __SAM_FILE_HEADER_H__ 00020 00021 #include <map> 00022 #include <stdint.h> 00023 00024 #include "SamReferenceInfo.h" 00025 #include "SamHeaderHD.h" 00026 #include "SamHeaderSQ.h" 00027 #include "SamHeaderRG.h" 00028 #include "SamHeaderPG.h" 00029 #include "SamStatus.h" 00030 00031 class SamFileHeader 00032 { 00033 public: 00034 SamFileHeader(); 00035 ~SamFileHeader(); 00036 00037 // Copy Constructor 00038 SamFileHeader(const SamFileHeader& header); 00039 00040 // Overload operator = to copy the passed in header into this header. 00041 SamFileHeader & operator = (const SamFileHeader& header); 00042 00043 // Overload operator = to copy the passed in header into this header. 00044 bool copy(const SamFileHeader& header); 00045 00046 void resetHeader(); 00047 00048 // Set the passed in string to the entire header string. Clearing its 00049 // current contents. 00050 // Return true if successfully set (even if set to "") 00051 bool getHeaderString(std::string& header) const; 00052 00053 int getReferenceID(const String & referenceName); 00054 int getReferenceID(const char* referenceName); 00055 const String & getReferenceLabel(int id) const; 00056 00057 // Get the Reference Information 00058 const SamReferenceInfo* getReferenceInfo() const; 00059 00060 // Add reference sequence name and reference sequence length to the header. 00061 void addReferenceInfo(const char* referenceSequenceName, 00062 int32_t referenceSequenceLength); 00063 00065 // Set Values in the header 00067 00068 // Add a header line that is just one tag with a const char* value. 00069 bool addHeaderLine(const char* type, const char* tag, const char* value); 00070 // Add a header line that is already preformatted in a const char*. 00071 // It is assumed that the line does not contain a \n. 00072 bool addHeaderLine(const char* headerLine); 00073 00074 // // Set the specified header type tag to the specified value in the 00075 // // header with the specified keyID. keyID must be specified when 00076 // // type = SQ, RG, or PG. 00077 // bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag, 00078 // const char* value, const char* keyID = NULL); 00079 00080 // Set the specified tag to the specified value in the HD header. 00081 bool setHDTag(const char* tag, const char* value); 00082 00083 // Set the specified tag to the specified value in the SQ header with 00084 // the specified name. 00085 // If the header does not yet exist, the header is added. 00086 bool setSQTag(const char* tag, const char* value, const char* name); 00087 00088 // Set the specified tag to the specified value in the RG header with 00089 // the read group identifier. 00090 // If the header does not yet exist, the header is added. 00091 bool setRGTag(const char* tag, const char* value, const char* id); 00092 00093 // Set the specified tag to the specified value in the PG header with 00094 // the specified id. 00095 // If the header does not yet exist, the header is added. 00096 bool setPGTag(const char* tag, const char* value, const char* id); 00097 00098 // Add the HD record to the header. 00099 // Note: it adds a pointer to the passed in header record. The header 00100 // record will be deleted when it is cleaned up from this header. 00101 bool addHD(SamHeaderHD* hd); 00102 00103 // Add the SQ record to the header. 00104 // Note: it adds a pointer to the passed in header record. The header 00105 // record will be deleted when it is cleaned up from this header. 00106 bool addSQ(SamHeaderSQ* sq); 00107 00108 // Add the RG record to the header. 00109 // Note: it adds a pointer to the passed in header record. The header 00110 // record will be deleted when it is cleaned up from this header. 00111 bool addRG(SamHeaderRG* rg); 00112 00113 // Add the PG record to the header. 00114 // Note: it adds a pointer to the passed in header record. The header 00115 // record will be deleted when it is cleaned up from this header. 00116 bool addPG(SamHeaderPG* pg); 00117 00119 // Remove entries from the header 00121 bool removeHD(); // Remove the HD record. 00122 bool removeSQ(const char* name); // Remove SQ record with the specified key. 00123 bool removeRG(const char* id); // Remove RG record with the specified key. 00124 bool removePG(const char* id); // Remove PG record with the specified key. 00125 00126 00128 // 00130 SamStatus::Status setHeaderFromBamFile(IFILE filePtr); 00131 00132 const char* getHDTagValue(const char* tag); 00133 // Get the value associated with the specified tag on the SQ line with 00134 // the specified sequence name. 00135 const char* getSQTagValue(const char* tag, const char* name); 00136 // Get the value associated with the specified tag on the RG line with 00137 // the specified read group identifier. 00138 const char* getRGTagValue(const char* tag, const char* id); 00139 // Get the value associated with the specified tag on the RG line with 00140 // the specified id. 00141 const char* getPGTagValue(const char* tag, const char* id); 00142 00143 // Get the number of SQ objects. 00144 int getNumSQs(); 00145 00146 // Get the number of RG objects. 00147 int getNumRGs(); 00148 00149 // Get the number of PG objects. 00150 int getNumPGs(); 00151 00152 // Get the HD object. 00153 SamHeaderHD* getHD(); 00154 00155 // Get the SQ object with the specified sequence name. 00156 SamHeaderSQ* getSQ(const char* name); 00157 00158 // Get the RG object with the specified read group identifier. 00159 SamHeaderRG* getRG(const char* id); 00160 00161 // Get the PG object with the specified id. 00162 SamHeaderPG* getPG(const char* id); 00163 00164 // ////////////////////////////////// 00165 // // Set methods for header fields. 00166 // bool setVersion(const char* version); 00167 // bool setSortOrder(const char* sortOrder); 00168 // bool addSequenceName(const char* sequenceName); 00169 // bool setSequenceLength(const char* keyID, int sequenceLength); 00170 // bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId); 00171 // bool setMD5Checksum(const char* keyID, const char* md5sum); 00172 // bool setURI(const char* keyID, const char* uri); 00173 // bool setSpecies(const char* keyID, const char* species); 00174 // bool addReadGroupID(const char* readGroupID); 00175 // bool setSample(const char* keyID, const char* sample); 00176 // bool setLibrary(const char* keyID, const char* library); 00177 // bool setDescription(const char* keyID, const char* description); 00178 // bool setPlatformUnit(const char* keyID, const char* platform); 00179 // bool setPredictedMedianInsertSize(const char* keyID, const char* isize); 00180 // bool setSequencingCenter(const char* keyID, const char* center); 00181 // bool setRunDate(const char* keyID, const char* runDate); 00182 // bool setTechnology(const char* keyID, const char* technology); 00183 // bool addProgram(const char* programID); 00184 // bool setProgramVersion(const char* keyID, const char* version); 00185 // bool setCommandLine(const char* keyID, const char* commandLine); 00186 00187 // /////////////////////////////////// 00188 // // Get methods for header fields. 00189 // // Returns the number of SQ entries in the header. 00190 // int32_t getSequenceDictionaryCount(); 00191 // Return the Sort Order value that is set in the Header. 00192 // If this field does not exist, "" is returned. 00193 const char* getSortOrder(); 00194 00195 00196 // DEPRECATED 00197 const char* getTagSO(); 00198 00199 // Get the next SQ header record. After all SQ headers have been retrieved, 00200 // NULL is returned until a reset is called. 00201 SamHeaderRecord* getNextSQRecord(); 00202 00203 // Get the next RG header record. After all RG headers have been retrieved, 00204 // NULL is returned until a reset is called. 00205 SamHeaderRecord* getNextRGRecord(); 00206 00207 // Get the next PG header record. After all PG headers have been retrieved, 00208 // NULL is returned until a reset is called. 00209 SamHeaderRecord* getNextPGRecord(); 00210 00211 // Reset to the beginning of the header records so the next call 00212 // to getNextSQRecord returns the first SQ header record. 00213 void resetSQRecordIter(); 00214 00215 // Reset to the beginning of the header records so the next call 00216 // to getNextRGRecord returns the first RG header record. 00217 void resetRGRecordIter(); 00218 00219 // Reset to the beginning of the header records so the next call 00220 // to getNextPGRecord returns the first PG header record. 00221 void resetPGRecordIter(); 00222 00223 // Get the next header record of the specified type. 00224 // Pass in the index to start looking at and the type to look for. 00225 // Update the index. 00226 // After all headers of that type have been retrieved, 00227 // NULL is returned until a reset is called for that type. 00228 SamHeaderRecord* getNextHeaderRecord(uint32_t& index, 00229 SamHeaderRecord::SamHeaderRecordType headerType); 00230 00231 // Get the next header record. After all headers have been retrieved, 00232 // NULL is returned until a reset is called. Does not return the 00233 // Comment lines. 00234 // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00235 // same iterator. 00236 SamHeaderRecord* getNextHeaderRecord(); 00237 00238 00239 // Set the passed in string to the next header line. The passed in 00240 // string will be overwritten. If there are no more header lines or there 00241 // is an error, false is returned and the passed in string is set to "" 00242 // until a rest is called. 00243 // Will also return the comment lines. 00244 // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00245 // same iterator. 00246 bool getNextHeaderLine(std::string &headerLine); 00247 00248 // Reset to the beginning of the header records so the next call 00249 // to getNextHeaderRecord returns the first header line. 00250 void resetHeaderRecordIter(); 00251 00252 // Returns the comment on the next comment line. Returns "" if all comment 00253 // lines have been returned, until resetCommentIter is called. 00254 const char* getNextComment(); 00255 00256 // Resets to the beginning of the comments so getNextComment returns 00257 // the first comment. 00258 void resetCommentIter(); 00259 00260 // Add a comment. 00261 bool addComment(const char* comment); 00262 00263 // Populate the reference info from the SQ fields. 00264 void generateReferenceInfo(); 00265 00266 00267 private: 00268 // Parse the header string. 00269 bool parseHeader(String& header); 00270 00271 // Parse the specified line of the header. 00272 bool parseHeaderLine(const String& headerLine); 00273 00274 // Set the passed in string to the header line at the specified index. 00275 // It does NOT clear the current contents of header. 00276 bool getHeaderLine(unsigned int index, std::string& header) const; 00277 00278 int16_t makeKey(char ch1, char ch2) 00279 { 00280 return((ch1 << 8) + ch2); 00281 } 00282 00283 // Only one HD type is allowed per file. 00284 SamHeaderHD* myHD; 00285 00286 // There can be multiple SQ Types, indexed by SN. 00287 StringHash mySQs; 00288 00289 // There can be multiple RG Types, indexed by ID. 00290 StringHash myRGs; 00291 00292 // There can be multiple PG types, indexed by ID. 00293 StringHash myPGs; 00294 00295 // Reference Name information 00296 SamReferenceInfo myReferenceInfo; 00297 00298 // Vector of comments 00299 std::vector<std::string> myComments; 00300 00301 std::vector<SamHeaderRecord*> myHeaderRecords; 00302 00303 uint32_t myCurrentSQIndex; 00304 00305 uint32_t myCurrentRGIndex; 00306 00307 uint32_t myCurrentPGIndex; 00308 00309 uint32_t myCurrentHeaderIndex; 00310 00311 uint32_t myCurrentCommentIndex; 00312 00313 static const std::string EMPTY_RETURN; 00314 }; 00315 00316 #endif 00317