00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __FASTQ_VALIDATOR_H__ 00019 #define __FASTQ_VALIDATOR_H__ 00020 00021 #include <iostream> 00022 #include <map> 00023 #include "StringBasics.h" 00024 #include "InputFile.h" 00025 #include "BaseComposition.h" 00026 #include "FastQStatus.h" 00027 00028 class FastQFile 00029 { 00030 public: 00031 // Constructor. 00032 // minReadLength - The minimum length that a base sequence must be for 00033 // it to be valid. 00034 // numPrintableErrors - The maximum number of errors that should be reported 00035 // in detail before suppressing the errors. 00036 FastQFile(int minReadLength = 10, int numPrintableErrors = 20); 00037 00038 // Disable messages - do not write to cout. 00039 void disableMessages(); 00040 00041 // Enable messages - write to cout. 00042 void enableMessages(); 00043 00044 00045 // Set the number of errors after which to quit reading/validating a file. 00046 // Defaults to -1. 00047 // -1 indicates to not quit until the entire file has been read/validated. 00048 // 0 indicates to quit without reading/validating anything. 00049 void setMaxErrors(int maxErrors); 00050 00051 // Open a FastQFile. 00052 // If baseLetter is specified to be non-"", then it will be used to 00053 // set the base sequence for this file. If the letter is in base-space, that 00054 // will be used. If it is in color-space, that will be used, if it is in 00055 // neither, then both are allowed. If it is blank, then the first 00056 // character of the sequence will be used to set the space type. 00057 FastQStatus::Status openFile(const char* fileName, 00058 BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN); 00059 00060 // Close a FastQFile. 00061 FastQStatus::Status closeFile(); 00062 00063 // Check to see if the file is open. 00064 bool isOpen(); 00065 00066 // Check to see if the file is at the end of the file. 00067 bool isEof(); 00068 00069 // Returns whether or not to keep reading the file. 00070 // Stop reading (false) if eof or there is a problem reading the file. 00071 bool keepReadingFile(); 00072 00073 // Validate the specified fastq file 00074 // filename - fastq file to be validated. 00075 // printBaseComp - whether or not to print the base composition for the file. 00076 // true means print it, false means do not. 00077 // spaceType - the spaceType to use for validation - BASE_SPACE, COLOR_SPACE, 00078 // or UNKNOWN. UNKNOWN means to determine the spaceType to 00079 // validate against from the first character of the first 00080 // sequence. 00081 // Returns the fastq validation status - SUCCESS on a successfully 00082 // validated fastq file. 00083 FastQStatus::Status validateFastQFile(const String &filename, 00084 bool printBaseComp, 00085 BaseAsciiMap::SPACE_TYPE spaceType); 00086 00087 // Read 1 FastQSequence, validating it. 00088 FastQStatus::Status readFastQSequence(); 00089 00090 // Keep public variables for a sequence's line so they can be accessed 00091 // without having to do string copies. 00092 String myRawSequence; 00093 String mySequenceIdLine; 00094 String mySequenceIdentifier; 00095 String myPlusLine; 00096 String myQualityString; 00097 00098 inline BaseAsciiMap::SPACE_TYPE getSpaceType() 00099 { 00100 return(myBaseComposition.getSpaceType()); 00101 } 00102 00103 private: 00104 00105 // Validates a single fastq sequence from myFile. 00106 bool validateFastQSequence(); 00107 00108 // Reads and validates the sequence identifier line of a fastq sequence. 00109 bool validateSequenceIdentifierLine(); 00110 00111 // Reads and validates the raw sequence line(s) and the plus line. Both are 00112 // included in one method since it is unknown when the raw sequence line 00113 // ends until you find the plus line that divides it from the quality 00114 // string. Since this method will read the plus line to know when the 00115 // raw sequence ends, it also validates that line. 00116 bool validateRawSequenceAndPlusLines(); 00117 00118 // Reads and validates the quality string line(s). 00119 bool validateQualityStringLines(); 00120 00121 // Method to validate a line that contains part of the raw sequence. 00122 // offset specifies where in the sequence to start validating. 00123 bool validateRawSequence(int offset); 00124 00125 // Method to validate the "+" line that seperates the raw sequence and the 00126 // quality string. 00127 bool validateSequencePlus(); 00128 00129 // Method to validate the quality string. 00130 // offset specifies where in the quality string to start validating. 00131 bool validateQualityString(int offset); 00132 00133 // Helper method to read a line from the input file into a string. 00134 // It also tracks the line number. 00135 void readLine(); 00136 00137 // Helper method for printing the contents of myErrorString. It will 00138 // only print the errors until the maximum number of reportable errors is 00139 // reached. 00140 void reportErrorOnLine(); 00141 00142 // Reset the member data for each fastq file. 00143 void reset(); 00144 00145 // Reset the member data for each sequence. 00146 void resetForEachSequence(); 00147 00148 // Log the specified message if enabled. 00149 void logMessage(const char* message); 00150 00151 // Determine if it is time to quit by checking if we are to quit after a 00152 // certain number of errors and that many errors have been encountered. 00153 bool isTimeToQuit(); 00154 00156 // Following member data elements are reset for each validated sequence. 00157 // 00158 00159 // Buffer for storing the contents of the line read. 00160 // Stored as member data so memory allocation is only done once. 00161 String myLineBuffer; 00162 00163 // Buffer for storing the error string. This prevents the reallocation of 00164 // the string buffer for each error. 00165 String myErrorString; 00166 00167 String myTempPartialQuality; 00168 00170 // Following member data elements are reset for each validated file. 00171 // 00172 IFILE myFile; // Input file to be read. 00173 String myFileName; // Name of file being processed. 00174 int myNumErrors; // Tracks the number of errors. 00175 uint myLineNum; // Track the line number - used for reporting errors. 00176 BaseComposition myBaseComposition; // Tracks the base composition. 00177 00178 // Map to track which identifiers have appeared in the file. 00179 std::map<std::string, uint> myIdentifierMap; 00180 00182 // Following member data do not change for each call to the validator. 00183 // 00184 int myMinReadLength; // Min Length for a read. 00185 int myNumPrintableErrors; // Max number of errors to print the details of. 00186 00187 // Number of errors after which to quit reading/validating a file. 00188 // Defaults to -1. 00189 // -1 indicates to not quit until the entire file has been read/validated. 00190 // 0 indicates to quit without reading/validating anything. 00191 int myMaxErrors; 00192 00193 00194 00195 // Whether or not messages should be printed. 00196 // Defaulted to false (they should be printed). 00197 bool myDisableMessages; 00198 00199 // Track if there is a problem reading the file. If there are read 00200 // problems, stop reading the file. 00201 bool myFileProblem; 00202 }; 00203 00204 #endif