Changes

From Genome Analysis Wiki
Jump to navigationJump to search
11,013 bytes added ,  14:52, 2 October 2013
Created page with "</source lang="cpp"> int main(int argc, char ** argv) { time_t start = time(NULL); printf("MiniMac - Imputation into phased haplotypes\n" "(c) 2010-2011 Goncalo..."
</source lang="cpp">

int main(int argc, char ** argv)
{
time_t start = time(NULL);

printf("MiniMac - Imputation into phased haplotypes\n"
"(c) 2010-2011 Goncalo Abecasis, Christian Fuchsberger\n");

ReadParameters(argc, argv);

// Read marker list
printf("Reading Reference Marker List ...\n");

StringArray refMarkerList;
refMarkerList.Read(referenceSnps);

// Index markers
StringIntHash referenceHash;
for (int i = 0; i < refMarkerList.Length(); i++)
referenceHash.Add(refMarkerList[i].Trim(), i);

printf(" %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());

if (refMarkerList.Length() == 0)
error("Reference panel marker list is empty - please verify filenames and contents!");

// Load reference haplotypes
printf("Loading reference haplotypes ...\n");
HaplotypeSet reference;

reference.markerCount = refMarkerList.Length();
reference.LoadHaplotypes(referenceHaplotypes);

printf(" %d Reference Haplotypes Loaded ...\n\n", reference.count);

// Read framework marker list
printf("Reading Framework Marker List ...\n");
StringArray markerList;
markerList.Read(snps);

if (markerList.Length() == 0)
error("Framework marker list is empty - please verify filenames and contents!");

// Crossref Marker Names to Reference Panel Positions
IntArray markerIndex;
markerIndex.Dimension(markerList.Length());

int matches = 0;

for (int i = 0; i < markerList.Length(); i++)
{
markerIndex[i] = referenceHash.Integer(markerList[i].Trim());

if (markerIndex[i] >= 0) matches++;
}

printf(" %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);

if (matches == 0)
error("No markers overlap between target and reference\n"
"Please check correct reference is being used and markers are named consistently");

printf(" %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);

// Check for flips in reference vs. target haplotypes
int flips = 0;
int previous = -1;
for (int i = 0; i < markerIndex.Length(); i++)
if (markerIndex[i] >= 0)
if (markerIndex[i] < previous)
{
if (flips++ < 10)
printf(" -> Marker %s precedes %s in reference, but follows it in target\n",
(const char *) refMarkerList[previous],
(const char *) markerList[i]);
previous = markerIndex[i];
}
if (flips > 10)
printf(" -> %d Additional Marker Order Changes Not Listed\n", flips - 10);
if (flips)
printf(" %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips);

// Load target haplotypes
printf("Loading target haplotypes ...\n");
HaplotypeSet target;

target.markerCount = markerList.Length();
target.LoadHaplotypes(haplotypes, true);

reference.CalculateFrequencies();
target.CalculateFrequencies();
target.CompareFrequencies(reference, markerIndex, markerList);

printf(" %d Target Haplotypes Loaded ...\n\n", target.count);

int startIndex = 0;
int stopIndex = reference.markerCount - 1;

printf("Setting up Markov Model...\n\n");

// Setup Markov Model
MarkovParameters mp;

mp.Allocate(reference.markerCount);

if (rounds > 0)
printf("Initializing Model Parameters (using E-M and up to %d haplotypes)\n", states);

// Simple initial estimates of error and recombination rate
for (int i = 0; i < reference.markerCount; i++)
mp.E[i] = 0.01;

for (int i = 0; i < reference.markerCount - 1; i++)
mp.R[i] = 0.001;

if (mp.ReadErrorRates(errorRates))
printf(" Updated error rates using data in %s ...\n", (const char *) errorRates);

if (mp.ReadCrossoverRates(recombinationRates))
printf(" Updated recombination rates using %s ...\n", (const char *) recombinationRates);

// Parameter estimation loop
for (int round = 0; round < rounds; round++)
{
printf(" Round %d of Parameter Refinement ...\n", round + 1);

int iterations = states < reference.count ? states : reference.count;

MarkovModel original;
original.CopyParameters(mp);

#pragma omp parallel for
for (int i = 0; i < iterations; i++)
{
MarkovModel mm;

mm.Allocate(reference.markerCount, reference.count - 1);
mm.CopyParameters(original);

// Reference leave one out (loo) panel
char ** reference_loo = new char * [reference.count - 1];
for (int in = 0, out = 0; in < reference.count; in++)
if (in != i)
reference_loo[out++] = reference.haplotypes[in];

mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq);
mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);

delete [] reference_loo;

#pragma omp critical
mp += mm;
}

if (round >= rounds / 2)
{
int iterations = states < target.count ? states : target.count;

#pragma omp parallel for
for (int i = 0; i < iterations; i++)
{
MarkovModel mm;

mm.Allocate(reference.markerCount, reference.count);
mm.CopyParameters(original);

// Padded version of target haplotype, including missing sites
char * padded = new char [reference.markerCount];
for (int k = 0; k < reference.markerCount; k++)
padded[k] = 0;

// Copy current haplotype into padded vector
for (int j = 0; j < target.markerCount; j++)
if (markerIndex[j] >= 0)
padded[markerIndex[j]] = target.haplotypes[i][j];

mm.WalkLeft(padded, reference.haplotypes, reference.freq);
mm.CountExpected(padded, reference.haplotypes, reference.freq);

delete [] padded;

#pragma omp critical
mp += mm;
}
}

mp.UpdateModel();
}

if (rounds > 0)
{
printf(" Saving estimated parameters for future use ...\n");
mp.WriteParameters(refMarkerList, prefix, gzip);
}

printf("\n");

// List the major allele at each location
reference.ListMajorAlleles();

printf("Generating Draft .info File ...\n\n");

// Output some basic information
IFILE info = ifopen(prefix + ".info.draft", "wb");

ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n");

for (int i = 0, j = 0; i <= stopIndex; i++)
if (i >= startIndex)
ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n",
(const char *) refMarkerList[i],
reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i),
reference.freq[reference.major[i]][i],
j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-");
else
if (j < markerIndex.Length() && i == markerIndex[j])
j++;

ifclose(info);

printf("Imputing Genotypes ...\n");

IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wb");
IFILE probabilities, hapdose, haps;

if (dosages == NULL)
error("Error opening output file for storing dosage information.");

if (phased)
{
hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wb");
haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wb");

if (hapdose == NULL || haps == NULL)
error("Error opening output files for storing haplotype information.");
}


if (probs)
{
probabilities = ifopen(prefix + ".prob" + (gzip ? ".gz" : ""), "wb");

if (hapdose == NULL || haps == NULL)
error("Error opening output file for storing genotype probabilities.");
}

ImputationStatistics stats(reference.markerCount);

// Impute each haplotype
#pragma omp parallel for
for (int i = 0; i < target.count; i++)
{
if (i != 0 && target.labels[i] == target.labels[i-1])
continue;

MarkovModel mm;

mm.Allocate(reference.markerCount, reference.count);
mm.ClearImputedDose();
mm.CopyParameters(mp);

// Padded version of target haplotype, including missing sites
char * padded = new char [reference.markerCount];
for (int j = 0; j < reference.markerCount; j++)
padded[j] = 0;

int k = i;

do {
printf(" Processing Haplotype %d of %d ...\n", k + 1, target.count);

// Copy current haplotype into padded vector
for (int j = 0; j < target.markerCount; j++)
if (markerIndex[j] >= 0)
padded[markerIndex[j]] = target.haplotypes[k][j];

mm.WalkLeft(padded, reference.haplotypes, reference.freq);
mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);

#pragma omp critical
{ stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }

#pragma omp critical
if (phased)
{
ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
for (int j = startIndex; j <= stopIndex; j++)
{
ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]);
ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]);
}
ifprintf(hapdose, "\n");
ifprintf(haps, "\n");
}

k++;
} while (k < target.count && target.labels[k] == target.labels[i]);

printf(" Outputting Individual %s ...\n", (const char *) target.labels[i]);

#pragma omp critical
{
ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]);
for (int j = startIndex; j <= stopIndex; j++)
ifprintf(dosages, "\t%.3f", mm.imputedDose[j]);
ifprintf(dosages, "\n");

if (probabilities != NULL)
{
ifprintf(probabilities, "%s\tPROB", (const char *) target.labels[i]);
for (int j = startIndex; j <= stopIndex; j++)
{
double p1 = mm.imputedDose[j] - mm.imputedHap[j];
double p2 = mm.imputedHap[j];

if (p1 < 0.) p1 = 0.;

double pHom = p1 * p2;
double pHet = p1 * (1. - p2) + (1. - p1) * p2;

ifprintf(probabilities, "%\t%.3f\t%.3f", pHom, pHet);
}
ifprintf(probabilities, "\n");
}
}

delete [] padded;
}

ifclose(dosages);

if (phased)
{
ifclose(hapdose);
ifclose(haps);
}

if (probabilities != NULL)
ifclose(probabilities);

time_t stop = time(NULL);
int seconds = stop - start;

printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n",
seconds / 3600, (seconds % 3600) / 60, seconds % 60,
ctime(&stop));
}

</source>

Navigation menu