FastQFile.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __FASTQ_VALIDATOR_H__
00019 #define __FASTQ_VALIDATOR_H__
00020 
00021 #include <iostream>
00022 #include <map>
00023 #include "StringBasics.h"
00024 #include "InputFile.h"
00025 #include "BaseComposition.h"
00026 #include "FastQStatus.h"
00027 
00028 class FastQFile // Opens, reads, and validates fastq files.
00029 {
00030  public:
00031    // Constructor.
00032    // minReadLength - The minimum length that a base sequence must be for
00033    //                 it to be valid.
00034    // numPrintableErrors - The maximum number of errors that should be reported
00035    //                      in detail before suppressing the errors.
00036    FastQFile(int minReadLength = 10, int numPrintableErrors = 20);
00037 
00038    // Disable messages - do not write to cout.
00039    void disableMessages();
00040 
00041    // Enable messages - write to cout.
00042    void enableMessages();
00043 
00044 
00045    // Set the number of errors after which to quit reading/validating a file.
00046    // Defaults to -1.
00047    //   -1 indicates to not quit until the entire file has been read/validated.
00048    //    0 indicates to quit without reading/validating anything.
00049    void setMaxErrors(int maxErrors);
00050 
00051    // Open a FastQFile.
00052    // fileName - name of the fastq file to open.
00053    // spaceType - the space type to use for this file; defaults to UNKNOWN.
00054    //             NOTE(review): presumably UNKNOWN means the space type is
00055    //             set from the first character of the sequence, as described
00056    //             for validateFastQFile below - confirm in the implementation.
00057    FastQStatus::Status openFile(const char* fileName,
00058                                 BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN);
00059    
00060    // Close a FastQFile.
00061    FastQStatus::Status closeFile();
00062 
00063    // Check to see if the file is open.
00064    bool isOpen();
00065 
00066    // Check to see if the file is at the end of the file.
00067    bool isEof();
00068    
00069    // Returns whether or not to keep reading the file.
00070    // Stop reading (false) if eof or there is a problem reading the file.
00071    bool keepReadingFile();
00072       
00073    // Validate the specified fastq file
00074    // filename - fastq file to be validated.
00075    // printBaseComp - whether or not to print the base composition for the file.
00076    //                 true means print it, false means do not.
00077    // spaceType - the spaceType to use for validation - BASE_SPACE, COLOR_SPACE,
00078    //             or UNKNOWN.  UNKNOWN means to determine the spaceType to
00079    //             validate against from the first character of the first
00080    //             sequence.
00081    // Returns the fastq validation status -  SUCCESS on a successfully
00082    // validated fastq file.
00083    FastQStatus::Status validateFastQFile(const String &filename,  
00084                                          bool printBaseComp,
00085                                          BaseAsciiMap::SPACE_TYPE spaceType);
00086 
00087    // Read 1 FastQSequence, validating it.
00088    FastQStatus::Status readFastQSequence();
00089 
00090    // Keep public variables for a sequence's line so they can be accessed
00091    // without having to do string copies.
00092    String myRawSequence;
00093    String mySequenceIdLine;
00094    String mySequenceIdentifier;
00095    String myPlusLine;
00096    String myQualityString;
00097 
00098    inline BaseAsciiMap::SPACE_TYPE getSpaceType() // Delegates to myBaseComposition.getSpaceType().
00099    {
00100       return(myBaseComposition.getSpaceType());
00101    }
00102 
00103  private:
00104 
00105    // Validates a single fastq sequence from myFile.
00106    bool validateFastQSequence();
00107 
00108    // Reads and validates the sequence identifier line of a fastq sequence.
00109    bool validateSequenceIdentifierLine();
00110 
00111    // Reads and validates the raw sequence line(s) and the plus line.  Both are
00112    // included in one method since it is unknown when the raw sequence line
00113    // ends until you find the plus line that divides it from the quality
00114    // string.  Since this method will read the plus line to know when the
00115    // raw sequence ends, it also validates that line.
00116    bool validateRawSequenceAndPlusLines();
00117 
00118    // Reads and validates the quality string line(s).
00119    bool validateQualityStringLines();
00120 
00121    // Method to validate a line that contains part of the raw sequence.
00122    // offset specifies where in the sequence to start validating.
00123    bool validateRawSequence(int offset);
00124 
00125    // Method to validate the "+" line that separates the raw sequence and the
00126    // quality string.
00127    bool validateSequencePlus();
00128 
00129    // Method to validate the quality string.
00130    // offset specifies where in the quality string to start validating.
00131    bool validateQualityString(int offset);
00132 
00133    // Helper method to read a line from the input file into a string.
00134    // It also tracks the line number.
00135    void readLine();
00136 
00137    // Helper method for printing the contents of myErrorString.  It will
00138    // only print the errors until the maximum number of reportable errors is
00139    // reached.
00140    void reportErrorOnLine();
00141 
00142    // Reset the member data for each fastq file.
00143    void reset();
00144 
00145    // Reset the member data for each sequence.
00146    void resetForEachSequence();
00147 
00148    // Log the specified message if enabled.
00149    void logMessage(const char* message);
00150 
00151    // Determine if it is time to quit by checking if we are to quit after a
00152    // certain number of errors and that many errors have been encountered.
00153    bool isTimeToQuit();
00154 
00156    // Following member data elements are reset for each validated sequence.
00157    //
00158 
00159    // Buffer for storing the contents of the line read.
00160    // Stored as member data so memory allocation is only done once.
00161    String myLineBuffer;
00162 
00163    // Buffer for storing the error string.  This prevents the reallocation of
00164    // the string buffer for each error.
00165    String myErrorString;
00166 
00167    String myTempPartialQuality; // NOTE(review): presumably scratch space for a partial quality line - confirm.
00168 
00170    // Following member data elements are reset for each validated file.
00171    //
00172    IFILE myFile; // Input file to be read.
00173    String myFileName; // Name of file being processed.
00174    int myNumErrors;   // Tracks the number of errors.
00175    uint myLineNum;    // Track the line number - used for reporting errors.
00176    BaseComposition myBaseComposition;  // Tracks the base composition.
00177 
00178    // Map to track which identifiers have appeared in the file.
00179    std::map<std::string, uint> myIdentifierMap;
00180  
00182    // Following member data do not change for each call to the validator.
00183    //
00184    int myMinReadLength; // Min Length for a read.
00185    int myNumPrintableErrors;  // Max number of errors to print the details of.
00186 
00187    // Number of errors after which to quit reading/validating a file.
00188    // Defaults to -1.
00189    //   -1 indicates to not quit until the entire file has been read/validated.
00190    //    0 indicates to quit without reading/validating anything.
00191    int myMaxErrors;
00192 
00193 
00194 
00195    // Whether or not printing of messages is disabled.
00196    // Defaulted to false (messages are printed).
00197    bool myDisableMessages;
00198 
00199    // Track if there is a problem reading the file.  If there are read
00200    // problems, stop reading the file.
00201    bool myFileProblem;
00202 };
00203 
00204 #endif
Generated on Tue Nov 9 16:11:49 2010 for StatGen Software by  doxygen 1.6.3