GenomeSequence.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020 
00021 #include <assert.h>
00022 #include <fcntl.h>
00023 #include <stdio.h>              // for size_t, which md5.h needs (for redhat)
00024 #include <stddef.h>             // for size_t, which md5.h needs (for correct systems)
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #if defined(WIN32)
00028 #define MD5_DIGEST_LENGTH 16
00029 #else
00030 #include <openssl/md5.h>        // for MD5_DIGEST_LENGTH
00031 #endif
00032 #include <string>
00033 #include <vector>
00034 #include "MemoryMapArray.h"
00035 
00036 // Goncalo's String class
00037 #include "StringArray.h"
00038 
00039 #ifndef __STDC_LIMIT_MACROS
00040 #define __STDC_LIMIT_MACROS
00041 #endif
00042 #include <stdint.h>
00043 
00044 typedef uint32_t    genomeIndex_t;
00045 #define INVALID_GENOME_INDEX UINT32_MAX
00046 
00047 // chromosome index is just a signed int, so this is ok here:
00048 #define INVALID_CHROMOSOME_INDEX -1
00049 
00050 
00051 //
00052 // ChromosomeInfo represents the per chromosome information
00053 // necessary to write out SAM/BAM records.  In addition, it
00054 // contains a single internal index used to point to the vector
00055 // offset where the chromosome bases start.
00056 //
00057 // This is mildly non-optimal for larger collections of chromosomes
00058 // or contigs - one use case described having millions of contigs,
00059 // in which case, this structure alone would take a gigabyte or more.
00060 //
00061 struct ChromosomeInfo
00062 {
00063     static const int  MAX_GENOME_INFO_STRING=128;
00064 
00065     void constructorClear()
00066     {
00067         memset(this,0, sizeof(*this));
00068     }
00069     void setChromosomeName(const char *n)
00070     {
00071         strncpy(name, n, sizeof(name)-1);
00072     }
00073     genomeIndex_t   start;                              // internal offset to combined genome vector
00074     genomeIndex_t   size;                               // SAM SQ:LN value
00075     char            md5[2*MD5_DIGEST_LENGTH + 1];       // 32 chars plus NUL, SAM SQ:M5 value
00076     char            name[MAX_GENOME_INFO_STRING];       // SAM SQ:SN value
00077     char            assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value
00078     char            uri[MAX_GENOME_INFO_STRING];        // SAM SQ:UR value
00079     char            species[MAX_GENOME_INFO_STRING];    // SAM SQ:SP value
00080 
00081     // handy setting methods:
00082     void setAssemblyID(const char *newID)
00083     {
00084         strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00085     }
00086     void setSpecies(const char *newSpecies)
00087     {
00088         strncpy(species, newSpecies, sizeof(newSpecies));
00089     }
00090     void setURI(const char *newURI)
00091     {
00092         strncpy(uri, newURI, sizeof(uri));
00093     }
00094 };
00095 
00096 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00097 {
00098     friend class GenomeSequence;
00099     friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00100 private:
00101     uint32_t    _chromosomeCount;
00102     bool        _colorSpace;
00103 
00104     ChromosomeInfo  _chromosomes[0];
00105 
00106 public:
00107     //
00108     // getHeaderSize is special in that it must not access any
00109     // member variables, since it is called when the header has
00110     // not been created yet.
00111     //
00112     static size_t  getHeaderSize(int chromosomeCount)
00113     {
00114         return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00115     }
00116     //
00117     // below methods return TRUE if it failed, false otherwise (primarily
00118     // a length check).
00119     //
00120 };
00121 
00122 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00123 
00124 //
00125 // define the genomeSequence array type:
00126 //
00127 // NB the access/set routines use the encoded base values in the range
00128 // 0-15, not the corresponding base pair character.
00129 //
00130 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00131 {
00132     if ((index&1)==0)
00133     {
00134         return ((uint8_t *) base)[index>>1] & 0xf;
00135     }
00136     else
00137     {
00138         return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00139     }
00140 };
00141 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00142 {
00143     if ((index&1)==0)
00144     {
00145         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00146     }
00147     else
00148     {
00149         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00150     }
00151 }
00152 
00153 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00154 {
00155     return sizeof(uint8_t) * i / 2;
00156 }
00157 
00158 #define UMFA_COOKIE 0x1b7933a1  // unique cookie id
00159 #define UMFA_VERSION 20100401U  // YYYYMMDD of last change to file layout
00160 
00161 typedef MemoryMapArray<
00162 uint8_t,
00163 genomeIndex_t,
00164 UMFA_COOKIE,
00165 UMFA_VERSION,
00166 genomeSequenceAccess,
00167 genomeSequenceSet,
00168 mmapGenomeSequenceElementCount2Bytes,
00169 genomeSequenceMmapHeader
00170 > genomeSequenceArray;
00171 
//
// A read stored as 4-bit values, two per byte: even positions are kept in
// the low nibble of a byte and odd positions in the high nibble.
//
class PackedRead
{
    // Store the low four bits of val at position index, leaving the other
    // nibble of the byte untouched.  The byte at index>>1 must already
    // exist in packedBases.
    void set(int index, int val)
    {
        const int shift = (index & 0x1) << 2;   // 0 for even, 4 for odd
        packedBases[index>>1] = static_cast<uint8_t>(
            (packedBases[index>>1] & ~(0x0f << shift))  // clear all four bits of the target nibble
            | ((val & 0x0f) << shift));                 // then OR in the new value
    }
public:
    std::vector<uint8_t> packedBases;   // packed storage, two values per byte
    uint32_t    length;                 // value returned by size()

    // Return length as an int.
    int size() const
    {
        return length;
    }

    // Discard all packed data and reset the length to zero.
    void clear()
    {
        packedBases.clear();
        length=0;
    }

    // Defined out of line; not visible in this header.
    void set(const char *rhs, int padWithNCount = 0);

    // Return the 4-bit value stored at position index (no bounds check).
    uint8_t operator [](int index) const
    {
        return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
    }
};
00199 
00200 // std::string &operator = (std::string &lhs, const PackedRead &rhs);
00201 //
00202 
00204 
00235 class GenomeSequence : public genomeSequenceArray
00236 {
00237 public:
00238     static const int baseAIndex = 000;
00239     static const int baseTIndex = 001;
00240     static const int baseCIndex = 002;
00241     static const int baseGIndex = 003;
00242     static const int baseNIndex = 004; // baseN --> bad read
00243     static const int baseXIndex = 005; // baseX --> unknown (bad data)
00244     //
00245     // two arrays for converting back and forth between base pair character
00246     // value (ASCII) to a base integer in the range 0..3.  Note there is actually
00247     // a value 4 and 5, for 'N' (indelible) and 'M' (unknown to me).
00248     //
00249     static unsigned char base2int[];        // base space read (ATCG)
00250     static const char int2base[];
00251     static const char int2colorSpace[];
00252     static unsigned char base2complement[];
00253 
00254 private:
00255     int                     _debugFlag;
00256 
00257     std::ostream            *_progressStream;
00258     bool                    _colorSpace;
00259 
00260     std::string             _baseFilename;   // for later use by WordIndex create and open
00261     std::string             _referenceFilename;
00262     std::string             _fastaFilename;
00263     std::string             _umfaFilename;
00264     std::string             _application;        // only used in ::create()
00265 
00266     MemoryMap               _umfaFile;
00267 
00268     void setup(const char *referenceFilename);
00269 
00270 public:
00272     GenomeSequence();
00273     void constructorClear();
00278     GenomeSequence(std::string &referenceFilename)
00279     {
00280         constructorClear();
00281         setup(referenceFilename.c_str());
00282     }
00283 
00288     GenomeSequence(const char *referenceFilename)
00289     {
00290         constructorClear();
00291         setup(referenceFilename);
00292     }
00293 
00295     ~GenomeSequence();
00296 
00302     bool open(bool isColorSpace = false, int flags = O_RDONLY);
00303 
00309     bool open(const char *filename, int flags = O_RDONLY)
00310     {
00311         _umfaFilename = filename;
00312         return genomeSequenceArray::open(filename, flags);
00313     }
00314 
00316 private:
00317     bool    _searchCommonFileSuffix;
00318 public:
00319     bool create();
00320     //
00321     // backwards compatability:
00322     bool create(bool isColor) {setColorSpace(isColor); return create(); }
00323 
00324     // NEW API?
00325 
00326     // load time modifiers:
00327     void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00328     void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00329     void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00330 
00331     bool loadFastaData(const char *filename);
00332 
00338     bool setReferenceName(std::string referenceFilename);
00339 
00343     void setApplication(std::string application)
00344     {
00345         _application = application;     // used in ::create() to set application name
00346     }
00347     const std::string &getFastaName()
00348     {
00349         return _fastaFilename;
00350     }
00351     const std::string &getReferenceName()
00352     {
00353         return _referenceFilename;
00354     }
00355 
00358     bool isColorSpace()
00359     {
00360         return _colorSpace;
00361     }
00362 
00365     genomeIndex_t   getNumberBases()
00366     {
00367         return getElementCount();
00368     }
00369 
00377     int getChromosome(genomeIndex_t position);
00385     int getChromosome(const char *chromosomeName);
00388     int getChromosomeCount();
00389 
00394     genomeIndex_t getChromosomeStart(int chromosomeIndex)
00395     {
00396         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00397         return header->_chromosomes[chromosomeIndex].start;
00398     }
00399 
00404     genomeIndex_t getChromosomeSize(int chromosomeIndex)
00405     {
00406         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00407         return header->_chromosomes[chromosomeIndex].size;
00408     }
00409 
00415     genomeIndex_t getGenomePosition(
00416         const char *chromosomeName,
00417         unsigned int chromosomeIndex);
00418 
00424     genomeIndex_t getGenomePosition(
00425         int chromosome,
00426         unsigned int chromosomeIndex);
00427 
00430     genomeIndex_t getGenomePosition(const char *chromosomeName);
00431     genomeIndex_t getGenomePosition(int chromosomeIndex);
00432 
00433     const std::string &getBaseFilename()
00434     {
00435         return _baseFilename;
00436     }
00437     const char *getChromosomeName(int chromosomeIndex)
00438     {
00439         return header->_chromosomes[chromosomeIndex].name;
00440     }
00441 
00442     void setDebugFlag(bool d)
00443     {
00444         _debugFlag = d;
00445     }
00446 
00447     genomeIndex_t sequenceLength()
00448     {
00449         return (genomeIndex_t) header->elementCount;
00450     }
00451     const char *chromosomeName(int chr)
00452     {
00453         return header->_chromosomes[chr].name;
00454     }
00455 
00456     void sanityCheck(MemoryMap &fasta);
00457     std::string IntegerToSeq(unsigned int n, unsigned int wordsize);
00458 
00459     bool wordMatch(unsigned int index, std::string &word);
00460     bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word);
00461     char BasePair(char c)
00462     {
00463         return base2complement[(int) c];
00464     }
00465     void dumpSequenceSAMDictionary(std::ostream&);
00466     void dumpHeaderTSV(std::ostream&);
00467     void loadHeaderFromTSV(std::istream&);
00468 
00483 
00484     //
00485     // 3.5% improvement for color space matching:
00486     // I guess the compiler isn't inlining 3 functions deep.
00487     //
00488 
00489 #if 0
00490     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
00491     // The following code does not work even in the base space,
00492     // since the memory layout has changed. USE IT WITH CAUTIOUS!
00493     //
00494     // This block of code is a functional duplicate of the following
00495     // code - leave this here for reference and possibly later
00496     // performance testing as well as compiler evaluation.
00497     inline char operator[](genomeIndex_t index)
00498     {
00499         return int2base[(*((genomeSequenceArray*) this))[index]];
00500     }
00501 #endif
00502 
00503     inline char operator[](genomeIndex_t index)
00504     {
00505         uint8_t val;
00506         if (index < getNumberBases())
00507         {
00508             if ((index&1)==0)
00509             {
00510                 val = ((uint8_t *) data)[index>>1] & 0xf;
00511             }
00512             else
00513             {
00514                 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00515             }
00516         }
00517         else
00518         {
00519             val = baseNIndex;
00520         }
00521         val = isColorSpace() ? int2colorSpace[val] : int2base[val];
00522         return val;
00523     }
00524 
00525     inline uint8_t getInteger(genomeIndex_t index)
00526     {
00527         return (*((genomeSequenceArray*) this))[index];
00528     }
00529     inline void set(genomeIndex_t index, char value)
00530     {
00531         genomeSequenceArray::set(index, base2int[(uint8_t) value]);
00532     }
00542     uint8_t *getDataPtr(genomeIndex_t index)
00543     {
00544         return ((uint8_t *) data + index/2);
00545     }
00546 private:
00552     bool setChromosomeMD5andLength(uint32_t whichChromosome);
00553 public:
00554     // replace read with the reversed one
00555     void getReverseRead(std::string &read);
00556     void getReverseRead(String& read);
00557     // debug the given read - print nice results
00558     int debugPrintReadValidation(
00559         std::string &read,
00560         std::string &quality,
00561         char   direction,
00562         genomeIndex_t   readLocation,
00563         int sumQuality,
00564         int mismatchCount,
00565         bool recurse = true
00566     );
00567 
00568     void getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount);
00569     void getString(String &str, int chromosome, genomeIndex_t index, int baseCount);
00570     //
00571     // get the sequence from this GenomeSequence.
00572     // if baseCount < 0, get the reverse complement
00573     // that starts at index (but do not reverse the string?)
00574     //
00575     void getString(std::string &str, genomeIndex_t index, int baseCount);
00576     void getString(String &str, genomeIndex_t index, int baseCount);
00577 
00578     void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd);
00579 
00580     void print30(genomeIndex_t);
00581 
00582     // for debugging, not for speed:
00583     genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize);
00584 
00592     int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0')
00593     {
00594         int mismatchCount = 0;
00595         for (uint32_t i=0; i<read.size(); i++)
00596             if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00597         return mismatchCount;
00598     };
00599 
00605     int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location)
00606     {
00607         int sumQ = 0;
00608         for (uint32_t i=0; i<read.size(); i++)
00609             sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00610         return sumQ;
00611     };
00612     // return a string highlighting mismatch postions with '^' chars:
00613     void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location);
00614     void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location);
00615 
00616     void getChromosomeAndIndex(std::string &, genomeIndex_t);
00617     void getChromosomeAndIndex(String &, genomeIndex_t);
00618 
00630     bool checkRead(
00631         std::string &read,
00632         std::string &qualities,
00633         std::string &cigar,
00634         int &sumQ,              // input and output
00635         int &gapOpenCount,      // output only
00636         int &gapExtendCount,    // output only
00637         int &gapDeleteCount,    // output only
00638         std::string &result
00639     );
00640 
00641     bool populateDBSNP(mmapArrayBool_t &dbSNP, std::ifstream &inputFile);
00642 
00657     bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName);
00658 };
00659 
00660 #endif
Generated on Tue Nov 9 16:11:49 2010 for StatGen Software by  doxygen 1.6.3