830 - MINIMAC::main()
From Genome Analysis Wiki
Jump to navigationJump to search
</source lang="cpp">
int main(int argc, char ** argv)
{ time_t start = time(NULL);
printf("MiniMac - Imputation into phased haplotypes\n"
"(c) 2010-2011 Goncalo Abecasis, Christian Fuchsberger\n");
ReadParameters(argc, argv);
// Read marker list printf("Reading Reference Marker List ...\n");
StringArray refMarkerList; refMarkerList.Read(referenceSnps);
// Index markers StringIntHash referenceHash; for (int i = 0; i < refMarkerList.Length(); i++)
referenceHash.Add(refMarkerList[i].Trim(), i);
printf(" %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());
if (refMarkerList.Length() == 0) error("Reference panel marker list is empty - please verify filenames and contents!");
// Load reference haplotypes printf("Loading reference haplotypes ...\n"); HaplotypeSet reference;
reference.markerCount = refMarkerList.Length(); reference.LoadHaplotypes(referenceHaplotypes);
printf(" %d Reference Haplotypes Loaded ...\n\n", reference.count);
// Read framework marker list printf("Reading Framework Marker List ...\n"); StringArray markerList; markerList.Read(snps);
if (markerList.Length() == 0) error("Framework marker list is empty - please verify filenames and contents!");
// Crossref Marker Names to Reference Panel Positions IntArray markerIndex; markerIndex.Dimension(markerList.Length());
int matches = 0;
for (int i = 0; i < markerList.Length(); i++) { markerIndex[i] = referenceHash.Integer(markerList[i].Trim());
if (markerIndex[i] >= 0) matches++; }
printf(" %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);
if (matches == 0) error("No markers overlap between target and reference\n" "Please check correct reference is being used and markers are named consistently");
printf(" %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);
// Check for flips in reference vs. target haplotypes int flips = 0; int previous = -1; for (int i = 0; i < markerIndex.Length(); i++) if (markerIndex[i] >= 0) if (markerIndex[i] < previous) { if (flips++ < 10) printf(" -> Marker %s precedes %s in reference, but follows it in target\n", (const char *) refMarkerList[previous], (const char *) markerList[i]); previous = markerIndex[i]; } if (flips > 10) printf(" -> %d Additional Marker Order Changes Not Listed\n", flips - 10); if (flips) printf(" %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips);
// Load target haplotypes printf("Loading target haplotypes ...\n"); HaplotypeSet target;
target.markerCount = markerList.Length(); target.LoadHaplotypes(haplotypes, true);
reference.CalculateFrequencies(); target.CalculateFrequencies(); target.CompareFrequencies(reference, markerIndex, markerList);
printf(" %d Target Haplotypes Loaded ...\n\n", target.count);
int startIndex = 0; int stopIndex = reference.markerCount - 1;
printf("Setting up Markov Model...\n\n");
// Setup Markov Model MarkovParameters mp;
mp.Allocate(reference.markerCount);
if (rounds > 0) printf("Initializing Model Parameters (using E-M and up to %d haplotypes)\n", states);
// Simple initial estimates of error and recombination rate for (int i = 0; i < reference.markerCount; i++) mp.E[i] = 0.01;
for (int i = 0; i < reference.markerCount - 1; i++) mp.R[i] = 0.001;
if (mp.ReadErrorRates(errorRates)) printf(" Updated error rates using data in %s ...\n", (const char *) errorRates);
if (mp.ReadCrossoverRates(recombinationRates)) printf(" Updated recombination rates using %s ...\n", (const char *) recombinationRates);
// Parameter estimation loop for (int round = 0; round < rounds; round++) { printf(" Round %d of Parameter Refinement ...\n", round + 1);
int iterations = states < reference.count ? states : reference.count;
MarkovModel original; original.CopyParameters(mp);
#pragma omp parallel for for (int i = 0; i < iterations; i++) { MarkovModel mm;
mm.Allocate(reference.markerCount, reference.count - 1); mm.CopyParameters(original);
// Reference leave one out (loo) panel char ** reference_loo = new char * [reference.count - 1]; for (int in = 0, out = 0; in < reference.count; in++) if (in != i) reference_loo[out++] = reference.haplotypes[in];
mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq); mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);
delete [] reference_loo;
#pragma omp critical mp += mm; }
if (round >= rounds / 2) { int iterations = states < target.count ? states : target.count;
#pragma omp parallel for for (int i = 0; i < iterations; i++) { MarkovModel mm;
mm.Allocate(reference.markerCount, reference.count); mm.CopyParameters(original);
// Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int k = 0; k < reference.markerCount; k++) padded[k] = 0;
// Copy current haplotype into padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = target.haplotypes[i][j];
mm.WalkLeft(padded, reference.haplotypes, reference.freq); mm.CountExpected(padded, reference.haplotypes, reference.freq);
delete [] padded;
#pragma omp critical mp += mm; } }
mp.UpdateModel(); }
if (rounds > 0) { printf(" Saving estimated parameters for future use ...\n"); mp.WriteParameters(refMarkerList, prefix, gzip); }
printf("\n");
// List the major allele at each location reference.ListMajorAlleles();
printf("Generating Draft .info File ...\n\n");
// Output some basic information IFILE info = ifopen(prefix + ".info.draft", "wb");
ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n");
for (int i = 0, j = 0; i <= stopIndex; i++) if (i >= startIndex) ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n", (const char *) refMarkerList[i], reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i), reference.freq[reference.major[i]][i], j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-"); else if (j < markerIndex.Length() && i == markerIndex[j]) j++;
ifclose(info);
printf("Imputing Genotypes ...\n");
IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wb"); IFILE probabilities, hapdose, haps;
if (dosages == NULL) error("Error opening output file for storing dosage information.");
if (phased) { hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wb"); haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wb");
if (hapdose == NULL || haps == NULL) error("Error opening output files for storing haplotype information."); }
if (probs) { probabilities = ifopen(prefix + ".prob" + (gzip ? ".gz" : ""), "wb");
if (hapdose == NULL || haps == NULL) error("Error opening output file for storing genotype probabilities."); }
ImputationStatistics stats(reference.markerCount);
// Impute each haplotype #pragma omp parallel for for (int i = 0; i < target.count; i++) { if (i != 0 && target.labels[i] == target.labels[i-1]) continue;
MarkovModel mm;
mm.Allocate(reference.markerCount, reference.count); mm.ClearImputedDose(); mm.CopyParameters(mp);
// Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int j = 0; j < reference.markerCount; j++) padded[j] = 0;
int k = i;
do { printf(" Processing Haplotype %d of %d ...\n", k + 1, target.count);
// Copy current haplotype into padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = target.haplotypes[k][j];
mm.WalkLeft(padded, reference.haplotypes, reference.freq); mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);
#pragma omp critical { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }
#pragma omp critical if (phased) { ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1); ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1); for (int j = startIndex; j <= stopIndex; j++) { ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]); ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]); } ifprintf(hapdose, "\n"); ifprintf(haps, "\n"); }
k++; } while (k < target.count && target.labels[k] == target.labels[i]);
printf(" Outputting Individual %s ...\n", (const char *) target.labels[i]);
#pragma omp critical { ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]); for (int j = startIndex; j <= stopIndex; j++) ifprintf(dosages, "\t%.3f", mm.imputedDose[j]); ifprintf(dosages, "\n");
if (probabilities != NULL) { ifprintf(probabilities, "%s\tPROB", (const char *) target.labels[i]); for (int j = startIndex; j <= stopIndex; j++) { double p1 = mm.imputedDose[j] - mm.imputedHap[j]; double p2 = mm.imputedHap[j];
if (p1 < 0.) p1 = 0.;
double pHom = p1 * p2; double pHet = p1 * (1. - p2) + (1. - p1) * p2;
ifprintf(probabilities, "%\t%.3f\t%.3f", pHom, pHet); } ifprintf(probabilities, "\n"); } }
delete [] padded; }
ifclose(dosages);
if (phased) { ifclose(hapdose); ifclose(haps); }
if (probabilities != NULL) ifclose(probabilities);
time_t stop = time(NULL); int seconds = stop - start;
printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n", seconds / 3600, (seconds % 3600) / 60, seconds % 60, ctime(&stop)); }
</source>