830 - MINIMAC::main()
From Genome Analysis Wiki
Jump to navigationJump to searchint main(int argc, char ** argv)
{
time_t start = time(NULL);
printf("MiniMac - Imputation into phased haplotypes\n"
"(c) 2010-2011 Goncalo Abecasis, Christian Fuchsberger\n");
ReadParameters(argc, argv);
// =======================================================================================
// Read marker list ======================================================================
printf("Reading Reference Marker List ...\n");
StringArray refMarkerList;
refMarkerList.Read(referenceSnps);
// Index markers
StringIntHash referenceHash;
for (int i = 0; i < refMarkerList.Length(); i++)
referenceHash.Add(refMarkerList[i].Trim(), i);
printf(" %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());
if (refMarkerList.Length() == 0)
error("Reference panel marker list is empty - please verify filenames and contents!");
// =======================================================================================
// Load reference haplotypes =============================================================
printf("Loading reference haplotypes ...\n");
HaplotypeSet reference;
reference.markerCount = refMarkerList.Length();
reference.LoadHaplotypes(referenceHaplotypes);
printf(" %d Reference Haplotypes Loaded ...\n\n", reference.count);
// Read framework marker list
printf("Reading Framework Marker List ...\n");
StringArray markerList;
markerList.Read(snps);
if (markerList.Length() == 0)
error("Framework marker list is empty - please verify filenames and contents!");
// =======================================================================================
// Crossref Marker Names to Reference Panel Positions ====================================
IntArray markerIndex;
markerIndex.Dimension(markerList.Length());
int matches = 0;
for (int i = 0; i < markerList.Length(); i++)
{
markerIndex[i] = referenceHash.Integer(markerList[i].Trim());
if (markerIndex[i] >= 0) matches++;
}
printf(" %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);
if (matches == 0)
error("No markers overlap between target and reference\n"
"Please check correct reference is being used and markers are named consistently");
printf(" %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);
// =======================================================================================
// Check for flips in reference vs. target haplotypes discarded for simplicity ===========
// =======================================================================================
// Load target haplotypes ================================================================
printf("Loading target haplotypes ...\n");
HaplotypeSet target;
target.markerCount = markerList.Length();
target.LoadHaplotypes(haplotypes, true);
reference.CalculateFrequencies();
target.CalculateFrequencies();
target.CompareFrequencies(reference, markerIndex, markerList);
printf(" %d Target Haplotypes Loaded ...\n\n", target.count);
// =======================================================================================
// Setup basic model parameters ==========================================================
int startIndex = 0;
int stopIndex = reference.markerCount - 1;
printf("Setting up Markov Model...\n\n");
// Setup Markov Model
MarkovParameters mp;
mp.Allocate(reference.markerCount);
// Simple initial estimates of error and recombination rate
for (int i = 0; i < reference.markerCount; i++)
mp.E[i] = 0.01;
for (int i = 0; i < reference.markerCount - 1; i++)
mp.R[i] = 0.001;
if (mp.ReadErrorRates(errorRates))
printf(" Updated error rates using data in %s ...\n", (const char *) errorRates);
if (mp.ReadCrossoverRates(recombinationRates))
printf(" Updated recombination rates using %s ...\n", (const char *) recombinationRates);
if (rounds > 0)
printf("Initializing Model Parameters (using E-M and up to %d haplotypes)\n", states);
// =======================================================================================
// Parameter estimation loop =============================================================
for (int round = 0; round < rounds; round++)
{
printf(" Round %d of Parameter Refinement ...\n", round + 1);
int iterations = states < reference.count ? states : reference.count;
MarkovModel original;
original.CopyParameters(mp);
// =======================================================================================
// We can update parameters by comparing a reference haplotype to the others =============
#pragma omp parallel for
for (int i = 0; i < iterations; i++)
{
MarkovModel mm;
mm.Allocate(reference.markerCount, reference.count - 1);
mm.CopyParameters(original);
// Reference leave one out (loo) panel
char ** reference_loo = new char * [reference.count - 1];
for (int in = 0, out = 0; in < reference.count; in++)
if (in != i)
reference_loo[out++] = reference.haplotypes[in];
mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq);
mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);
delete [] reference_loo;
#pragma omp critical
mp += mm;
}
// =======================================================================================
// Or, by comparing a sampled haplotype to the reference =================================
if (round >= rounds / 2)
{
int iterations = states < target.count ? states : target.count;
#pragma omp parallel for
for (int i = 0; i < iterations; i++)
{
MarkovModel mm;
mm.Allocate(reference.markerCount, reference.count);
mm.CopyParameters(original);
// Padded version of target haplotype, including missing sites
char * padded = new char [reference.markerCount];
for (int k = 0; k < reference.markerCount; k++)
padded[k] = 0;
// Copy current haplotype into padded vector
for (int j = 0; j < target.markerCount; j++)
if (markerIndex[j] >= 0)
padded[markerIndex[j]] = target.haplotypes[i][j];
mm.WalkLeft(padded, reference.haplotypes, reference.freq);
mm.CountExpected(padded, reference.haplotypes, reference.freq);
delete [] padded;
#pragma omp critical
mp += mm;
}
}
// =======================================================================================
// At the end of each iteration, we need to update the model =============================
mp.UpdateModel();
}
if (rounds > 0)
{
printf(" Saving estimated parameters for future use ...\n");
mp.WriteParameters(refMarkerList, prefix, gzip);
}
printf("\n");
// List the major allele at each location
reference.ListMajorAlleles();
// =======================================================================================
// Now, we can do some imputation - but first open output files! =========================
printf("Imputing Genotypes ...\n");
IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wb");
IFILE probabilities, hapdose, haps;
if (dosages == NULL)
error("Error opening output file for storing dosage information.");
if (phased)
{
hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wb");
haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wb");
if (hapdose == NULL || haps == NULL)
error("Error opening output files for storing haplotype information.");
}
if (probs)
{
probabilities = ifopen(prefix + ".prob" + (gzip ? ".gz" : ""), "wb");
if (hapdose == NULL || haps == NULL)
error("Error opening output file for storing genotype probabilities.");
}
ImputationStatistics stats(reference.markerCount);
// =======================================================================================
// Now, loop through haplotypes, imputing one at a time ==================================
#pragma omp parallel for
for (int i = 0; i < target.count; i++)
{
if (i != 0 && target.labels[i] == target.labels[i-1])
continue;
MarkovModel mm;
mm.Allocate(reference.markerCount, reference.count);
mm.ClearImputedDose();
mm.CopyParameters(mp);
// Padded version of target haplotype, including missing sites
char * padded = new char [reference.markerCount];
for (int j = 0; j < reference.markerCount; j++)
padded[j] = 0;
int k = i;
do {
printf(" Processing Haplotype %d of %d ...\n", k + 1, target.count);
// Copy current haplotype into padded vector
for (int j = 0; j < target.markerCount; j++)
if (markerIndex[j] >= 0)
padded[markerIndex[j]] = target.haplotypes[k][j];
mm.WalkLeft(padded, reference.haplotypes, reference.freq);
mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);
#pragma omp critical
{ stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }
#pragma omp critical
if (phased)
{
ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
for (int j = startIndex; j <= stopIndex; j++)
{
ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]);
ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]);
}
ifprintf(hapdose, "\n");
ifprintf(haps, "\n");
}
k++;
} while (k < target.count && target.labels[k] == target.labels[i]);
printf(" Outputting Individual %s ...\n", (const char *) target.labels[i]);
#pragma omp critical
{
ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]);
for (int j = startIndex; j <= stopIndex; j++)
ifprintf(dosages, "\t%.3f", mm.imputedDose[j]);
ifprintf(dosages, "\n");
if (probabilities != NULL)
{
ifprintf(probabilities, "%s\tPROB", (const char *) target.labels[i]);
for (int j = startIndex; j <= stopIndex; j++)
{
double p1 = mm.imputedDose[j] - mm.imputedHap[j];
double p2 = mm.imputedHap[j];
if (p1 < 0.) p1 = 0.;
double pHom = p1 * p2;
double pHet = p1 * (1. - p2) + (1. - p1) * p2;
ifprintf(probabilities, "%\t%.3f\t%.3f", pHom, pHet);
}
ifprintf(probabilities, "\n");
}
}
delete [] padded;
}
// ============================================================================
// Close output files =========================================================
ifclose(dosages);
if (phased)
{
ifclose(hapdose);
ifclose(haps);
}
if (probabilities != NULL)
ifclose(probabilities);
// ===========================================================================
// Report runtime statistics =================================================
time_t stop = time(NULL);
int seconds = stop - start;
printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n",
seconds / 3600, (seconds % 3600) / 60, seconds % 60,
ctime(&stop));
}