00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020
00021 #include <assert.h>
00022 #include <fcntl.h>
00023 #include <stdio.h>
00024 #include <stddef.h>
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #if defined(WIN32)
00028 #define MD5_DIGEST_LENGTH 16
00029 #else
00030 #include <openssl/md5.h>
00031 #endif
00032 #include <string>
00033 #include <vector>
00034 #include "MemoryMapArray.h"
00035
00036
00037 #include "StringArray.h"
00038
00039 #ifndef __STDC_LIMIT_MACROS
00040 #define __STDC_LIMIT_MACROS
00041 #endif
00042 #include <stdint.h>
00043
00044 typedef uint32_t genomeIndex_t;
00045 #define INVALID_GENOME_INDEX UINT32_MAX
00046
00047
00048 #define INVALID_CHROMOSOME_INDEX -1
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061 struct ChromosomeInfo
00062 {
00063 static const int MAX_GENOME_INFO_STRING=128;
00064
00065 void constructorClear()
00066 {
00067 memset(this,0, sizeof(*this));
00068 }
00069 void setChromosomeName(const char *n)
00070 {
00071 strncpy(name, n, sizeof(name)-1);
00072 }
00073 genomeIndex_t start;
00074 genomeIndex_t size;
00075 char md5[2*MD5_DIGEST_LENGTH + 1];
00076 char name[MAX_GENOME_INFO_STRING];
00077 char assemblyID[MAX_GENOME_INFO_STRING];
00078 char uri[MAX_GENOME_INFO_STRING];
00079 char species[MAX_GENOME_INFO_STRING];
00080
00081
00082 void setAssemblyID(const char *newID)
00083 {
00084 strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00085 }
00086 void setSpecies(const char *newSpecies)
00087 {
00088 strncpy(species, newSpecies, sizeof(newSpecies));
00089 }
00090 void setURI(const char *newURI)
00091 {
00092 strncpy(uri, newURI, sizeof(uri));
00093 }
00094 };
00095
00096 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00097 {
00098 friend class GenomeSequence;
00099 friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00100 private:
00101 uint32_t _chromosomeCount;
00102 bool _colorSpace;
00103
00104 ChromosomeInfo _chromosomes[0];
00105
00106 public:
00107
00108
00109
00110
00111
00112 static size_t getHeaderSize(int chromosomeCount)
00113 {
00114 return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00115 }
00116
00117
00118
00119
00120 };
00121
00122 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00123
00124
00125
00126
00127
00128
00129
00130 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00131 {
00132 if ((index&1)==0)
00133 {
00134 return ((uint8_t *) base)[index>>1] & 0xf;
00135 }
00136 else
00137 {
00138 return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00139 }
00140 };
00141 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00142 {
00143 if ((index&1)==0)
00144 {
00145 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00146 }
00147 else
00148 {
00149 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00150 }
00151 }
00152
00153 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00154 {
00155 return sizeof(uint8_t) * i / 2;
00156 }
00157
00158 #define UMFA_COOKIE 0x1b7933a1 // unique cookie id
00159 #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout
00160
00161 typedef MemoryMapArray<
00162 uint8_t,
00163 genomeIndex_t,
00164 UMFA_COOKIE,
00165 UMFA_VERSION,
00166 genomeSequenceAccess,
00167 genomeSequenceSet,
00168 mmapGenomeSequenceElementCount2Bytes,
00169 genomeSequenceMmapHeader
00170 > genomeSequenceArray;
00171
00172 class PackedRead
00173 {
00174 void set(int index, int val)
00175 {
00176 packedBases[index>>1] =
00177 (packedBases[index>>1]
00178 & ~(7<<((index&0x01)<<2)))
00179 | ((val&0x0f)<<((index&0x1)<<2));
00180 }
00181 public:
00182 std::vector<uint8_t> packedBases;
00183 uint32_t length;
00184 int size()
00185 {
00186 return length;
00187 }
00188 void clear()
00189 {
00190 packedBases.clear();
00191 length=0;
00192 }
00193 void set(const char *rhs, int padWithNCount = 0);
00194 uint8_t operator [](int index)
00195 {
00196 return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00197 }
00198 };
00199
00200
00201
00202
00204
00235 class GenomeSequence : public genomeSequenceArray
00236 {
00237 public:
00238 static const int baseAIndex = 000;
00239 static const int baseTIndex = 001;
00240 static const int baseCIndex = 002;
00241 static const int baseGIndex = 003;
00242 static const int baseNIndex = 004;
00243 static const int baseXIndex = 005;
00244
00245
00246
00247
00248
00249 static unsigned char base2int[];
00250 static const char int2base[];
00251 static const char int2colorSpace[];
00252 static unsigned char base2complement[];
00253
00254 private:
00255 int _debugFlag;
00256
00257 std::ostream *_progressStream;
00258 bool _colorSpace;
00259
00260 std::string _baseFilename;
00261 std::string _referenceFilename;
00262 std::string _fastaFilename;
00263 std::string _umfaFilename;
00264 std::string _application;
00265
00266 MemoryMap _umfaFile;
00267
00268 void setup(const char *referenceFilename);
00269
00270 public:
00272 GenomeSequence();
00273 void constructorClear();
00278 GenomeSequence(std::string &referenceFilename)
00279 {
00280 constructorClear();
00281 setup(referenceFilename.c_str());
00282 }
00283
00288 GenomeSequence(const char *referenceFilename)
00289 {
00290 constructorClear();
00291 setup(referenceFilename);
00292 }
00293
00295 ~GenomeSequence();
00296
00302 bool open(bool isColorSpace = false, int flags = O_RDONLY);
00303
00309 bool open(const char *filename, int flags = O_RDONLY)
00310 {
00311 _umfaFilename = filename;
00312 return genomeSequenceArray::open(filename, flags);
00313 }
00314
00316 private:
00317 bool _searchCommonFileSuffix;
00318 public:
00319 bool create();
00320
00321
00322 bool create(bool isColor) {setColorSpace(isColor); return create(); }
00323
00324
00325
00326
00327 void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00328 void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00329 void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00330
00331 bool loadFastaData(const char *filename);
00332
00338 bool setReferenceName(std::string referenceFilename);
00339
00343 void setApplication(std::string application)
00344 {
00345 _application = application;
00346 }
00347 const std::string &getFastaName()
00348 {
00349 return _fastaFilename;
00350 }
00351 const std::string &getReferenceName()
00352 {
00353 return _referenceFilename;
00354 }
00355
00358 bool isColorSpace()
00359 {
00360 return _colorSpace;
00361 }
00362
00365 genomeIndex_t getNumberBases()
00366 {
00367 return getElementCount();
00368 }
00369
00377 int getChromosome(genomeIndex_t position);
00385 int getChromosome(const char *chromosomeName);
00388 int getChromosomeCount();
00389
00394 genomeIndex_t getChromosomeStart(int chromosomeIndex)
00395 {
00396 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00397 return header->_chromosomes[chromosomeIndex].start;
00398 }
00399
00404 genomeIndex_t getChromosomeSize(int chromosomeIndex)
00405 {
00406 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00407 return header->_chromosomes[chromosomeIndex].size;
00408 }
00409
00415 genomeIndex_t getGenomePosition(
00416 const char *chromosomeName,
00417 unsigned int chromosomeIndex);
00418
00424 genomeIndex_t getGenomePosition(
00425 int chromosome,
00426 unsigned int chromosomeIndex);
00427
00430 genomeIndex_t getGenomePosition(const char *chromosomeName);
00431 genomeIndex_t getGenomePosition(int chromosomeIndex);
00432
00433 const std::string &getBaseFilename()
00434 {
00435 return _baseFilename;
00436 }
00437 const char *getChromosomeName(int chromosomeIndex)
00438 {
00439 return header->_chromosomes[chromosomeIndex].name;
00440 }
00441
00442 void setDebugFlag(bool d)
00443 {
00444 _debugFlag = d;
00445 }
00446
00447 genomeIndex_t sequenceLength()
00448 {
00449 return (genomeIndex_t) header->elementCount;
00450 }
00451 const char *chromosomeName(int chr)
00452 {
00453 return header->_chromosomes[chr].name;
00454 }
00455
00456 void sanityCheck(MemoryMap &fasta);
00457 std::string IntegerToSeq(unsigned int n, unsigned int wordsize);
00458
00459 bool wordMatch(unsigned int index, std::string &word);
00460 bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word);
00461 char BasePair(char c)
00462 {
00463 return base2complement[(int) c];
00464 }
00465 void dumpSequenceSAMDictionary(std::ostream&);
00466 void dumpHeaderTSV(std::ostream&);
00467 void loadHeaderFromTSV(std::istream&);
00468
00483
00484
00485
00486
00487
00488
00489 #if 0
00490
00491
00492
00493
00494
00495
00496
00497 inline char operator[](genomeIndex_t index)
00498 {
00499 return int2base[(*((genomeSequenceArray*) this))[index]];
00500 }
00501 #endif
00502
00503 inline char operator[](genomeIndex_t index)
00504 {
00505 uint8_t val;
00506 if (index < getNumberBases())
00507 {
00508 if ((index&1)==0)
00509 {
00510 val = ((uint8_t *) data)[index>>1] & 0xf;
00511 }
00512 else
00513 {
00514 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00515 }
00516 }
00517 else
00518 {
00519 val = baseNIndex;
00520 }
00521 val = isColorSpace() ? int2colorSpace[val] : int2base[val];
00522 return val;
00523 }
00524
00525 inline uint8_t getInteger(genomeIndex_t index)
00526 {
00527 return (*((genomeSequenceArray*) this))[index];
00528 }
00529 inline void set(genomeIndex_t index, char value)
00530 {
00531 genomeSequenceArray::set(index, base2int[(uint8_t) value]);
00532 }
00542 uint8_t *getDataPtr(genomeIndex_t index)
00543 {
00544 return ((uint8_t *) data + index/2);
00545 }
00546 private:
00552 bool setChromosomeMD5andLength(uint32_t whichChromosome);
00553 public:
00554
00555 void getReverseRead(std::string &read);
00556 void getReverseRead(String& read);
00557
00558 int debugPrintReadValidation(
00559 std::string &read,
00560 std::string &quality,
00561 char direction,
00562 genomeIndex_t readLocation,
00563 int sumQuality,
00564 int mismatchCount,
00565 bool recurse = true
00566 );
00567
00568 void getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount);
00569 void getString(String &str, int chromosome, genomeIndex_t index, int baseCount);
00570
00571
00572
00573
00574
00575 void getString(std::string &str, genomeIndex_t index, int baseCount);
00576 void getString(String &str, genomeIndex_t index, int baseCount);
00577
00578 void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd);
00579
00580 void print30(genomeIndex_t);
00581
00582
00583 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize);
00584
00592 int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0')
00593 {
00594 int mismatchCount = 0;
00595 for (uint32_t i=0; i<read.size(); i++)
00596 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00597 return mismatchCount;
00598 };
00599
00605 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location)
00606 {
00607 int sumQ = 0;
00608 for (uint32_t i=0; i<read.size(); i++)
00609 sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00610 return sumQ;
00611 };
00612
00613 void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location);
00614 void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location);
00615
00616 void getChromosomeAndIndex(std::string &, genomeIndex_t);
00617 void getChromosomeAndIndex(String &, genomeIndex_t);
00618
00630 bool checkRead(
00631 std::string &read,
00632 std::string &qualities,
00633 std::string &cigar,
00634 int &sumQ,
00635 int &gapOpenCount,
00636 int &gapExtendCount,
00637 int &gapDeleteCount,
00638 std::string &result
00639 );
00640
00641 bool populateDBSNP(mmapArrayBool_t &dbSNP, std::ifstream &inputFile);
00642
00657 bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName);
00658 };
00659
00660 #endif