830 - MINIMAC::main()

From Genome Analysis Wiki
Revision as of 14:52, 2 October 2013 by Goncalo (talk | contribs) (Created page with "</source lang="cpp"> int main(int argc, char ** argv) { time_t start = time(NULL); printf("MiniMac - Imputation into phased haplotypes\n" "(c) 2010-2011 Goncalo...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigationJump to search

</source lang="cpp">

int main(int argc, char ** argv)

  {
  time_t start = time(NULL);
  printf("MiniMac - Imputation into phased haplotypes\n"

"(c) 2010-2011 Goncalo Abecasis, Christian Fuchsberger\n");

  ReadParameters(argc, argv);
  // Read marker list
  printf("Reading Reference Marker List ...\n");
  StringArray refMarkerList;
  refMarkerList.Read(referenceSnps);
  // Index markers
  StringIntHash referenceHash;
  for (int i = 0; i < refMarkerList.Length(); i++)

referenceHash.Add(refMarkerList[i].Trim(), i);

  printf("  %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());
  if (refMarkerList.Length() == 0)
     error("Reference panel marker list is empty - please verify filenames and contents!");
  // Load reference haplotypes
  printf("Loading reference haplotypes ...\n");
  HaplotypeSet reference;
  reference.markerCount = refMarkerList.Length();
  reference.LoadHaplotypes(referenceHaplotypes);
  printf("  %d Reference Haplotypes Loaded ...\n\n", reference.count);
  // Read framework marker list
  printf("Reading Framework Marker List ...\n");
  StringArray markerList;
  markerList.Read(snps);
  if (markerList.Length() == 0)
     error("Framework marker list is empty - please verify filenames and contents!");
  // Crossref Marker Names to Reference Panel Positions
  IntArray markerIndex;
  markerIndex.Dimension(markerList.Length());
  int matches = 0;
  for (int i = 0; i < markerList.Length(); i++)
     {
     markerIndex[i] = referenceHash.Integer(markerList[i].Trim());
     if (markerIndex[i] >= 0) matches++;
     }
  printf("  %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);
  if (matches == 0)
     error("No markers overlap between target and reference\n"
           "Please check correct reference is being used and markers are named consistently");
  printf("  %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);
  // Check for flips in reference vs. target haplotypes
  int flips = 0;
  int previous = -1;
  for (int i = 0; i < markerIndex.Length(); i++)
     if (markerIndex[i] >= 0)
        if (markerIndex[i] < previous)
           {
           if (flips++ < 10)
              printf("  -> Marker %s precedes %s in reference, but follows it in target\n",
                    (const char *) refMarkerList[previous],
                    (const char *) markerList[i]);
           previous = markerIndex[i];
           }
  if (flips > 10)
     printf("  -> %d Additional Marker Order Changes Not Listed\n", flips - 10);
  if (flips)
     printf("  %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips);
  // Load target haplotypes
  printf("Loading target haplotypes ...\n");
  HaplotypeSet target;
  target.markerCount = markerList.Length();
  target.LoadHaplotypes(haplotypes, true);
  reference.CalculateFrequencies();
  target.CalculateFrequencies();
  target.CompareFrequencies(reference, markerIndex, markerList);
  printf("  %d Target Haplotypes Loaded ...\n\n", target.count);
  int startIndex = 0;
  int stopIndex = reference.markerCount - 1;
  printf("Setting up Markov Model...\n\n");
  // Setup Markov Model
  MarkovParameters mp;
  mp.Allocate(reference.markerCount);
  if (rounds > 0)
     printf("Initializing Model Parameters (using E-M and up to %d haplotypes)\n", states);
  // Simple initial estimates of error and recombination rate
  for (int i = 0; i < reference.markerCount; i++)
     mp.E[i] = 0.01;
  for (int i = 0; i < reference.markerCount - 1; i++)
     mp.R[i] = 0.001;
  if (mp.ReadErrorRates(errorRates))
     printf("  Updated error rates using data in %s ...\n", (const char *) errorRates);
  if (mp.ReadCrossoverRates(recombinationRates))
     printf("  Updated recombination rates using %s ...\n", (const char *) recombinationRates);
  // Parameter estimation loop
  for (int round = 0; round < rounds; round++)
     {
     printf("  Round %d of Parameter Refinement ...\n", round + 1);
     int iterations = states < reference.count ? states : reference.count;
     MarkovModel original;
     original.CopyParameters(mp);
     #pragma omp parallel for
     for (int i = 0; i < iterations; i++)
        {
        MarkovModel mm;
        mm.Allocate(reference.markerCount, reference.count - 1);
        mm.CopyParameters(original);
        // Reference leave one out (loo) panel
        char ** reference_loo = new char * [reference.count - 1];
        for (int in = 0, out = 0; in < reference.count; in++)
            if (in != i)
               reference_loo[out++] = reference.haplotypes[in];
        mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq);
        mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);
        delete [] reference_loo;
        #pragma omp critical
        mp += mm;
        }
     if (round >= rounds / 2)
        {
        int iterations = states < target.count ? states : target.count;
        #pragma omp parallel for
        for (int i = 0; i < iterations; i++)
           {
           MarkovModel mm;
           mm.Allocate(reference.markerCount, reference.count);
           mm.CopyParameters(original);
           // Padded version of target haplotype, including missing sites
           char * padded = new char [reference.markerCount];
           for (int k = 0; k < reference.markerCount; k++)
              padded[k] = 0;
           // Copy current haplotype into padded vector
           for (int j = 0; j < target.markerCount; j++)
              if (markerIndex[j] >= 0)
                 padded[markerIndex[j]] = target.haplotypes[i][j];
           mm.WalkLeft(padded, reference.haplotypes, reference.freq);
           mm.CountExpected(padded, reference.haplotypes, reference.freq);
           delete [] padded;
           #pragma omp critical
           mp += mm;
           }
        }
     mp.UpdateModel();
     }
  if (rounds > 0)
     {
     printf("  Saving estimated parameters for future use ...\n");
     mp.WriteParameters(refMarkerList, prefix, gzip);
     }
  printf("\n");
  // List the major allele at each location
  reference.ListMajorAlleles();
  printf("Generating Draft .info File ...\n\n");
  // Output some basic information
  IFILE info = ifopen(prefix + ".info.draft", "wb");
  ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n");
  for (int i = 0, j = 0; i <= stopIndex; i++)
     if (i >= startIndex)
        ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n",
           (const char *) refMarkerList[i],
           reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i),
           reference.freq[reference.major[i]][i],
           j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-");
     else
        if (j < markerIndex.Length() && i == markerIndex[j])
           j++;
  ifclose(info);
  printf("Imputing Genotypes ...\n");
  IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wb");
  IFILE probabilities, hapdose, haps;
  if (dosages == NULL)
     error("Error opening output file for storing dosage information.");
  if (phased)
     {
     hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wb");
     haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wb");
     if (hapdose == NULL || haps == NULL)
        error("Error opening output files for storing haplotype information.");
     }


  if (probs)
     {
     probabilities = ifopen(prefix + ".prob" + (gzip ? ".gz" : ""), "wb");
     if (hapdose == NULL || haps == NULL)
        error("Error opening output file for storing genotype probabilities.");
     }
  ImputationStatistics stats(reference.markerCount);
  // Impute each haplotype
  #pragma omp parallel for
  for (int i = 0; i < target.count; i++)
     {
     if (i != 0 && target.labels[i] == target.labels[i-1])
        continue;
     MarkovModel mm;
     mm.Allocate(reference.markerCount, reference.count);
     mm.ClearImputedDose();
     mm.CopyParameters(mp);
     // Padded version of target haplotype, including missing sites
     char * padded = new char [reference.markerCount];
     for (int j = 0; j < reference.markerCount; j++)
        padded[j] = 0;
     int k = i;
     do {
        printf("  Processing Haplotype %d of %d ...\n", k + 1, target.count);
        // Copy current haplotype into padded vector
        for (int j = 0; j < target.markerCount; j++)
           if (markerIndex[j] >= 0)
              padded[markerIndex[j]] = target.haplotypes[k][j];
        mm.WalkLeft(padded, reference.haplotypes, reference.freq);
        mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);
        #pragma omp critical
        { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }
        #pragma omp critical
        if (phased)
           {
           ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
           ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
           for (int j = startIndex; j <= stopIndex; j++)
              {
              ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]);
              ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]);
              }
           ifprintf(hapdose, "\n");
           ifprintf(haps, "\n");
           }
        k++;
     } while (k < target.count && target.labels[k] == target.labels[i]);
     printf("    Outputting Individual %s ...\n", (const char *) target.labels[i]);
     #pragma omp critical
        {
        ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]);
        for (int j = startIndex; j <= stopIndex; j++)
           ifprintf(dosages, "\t%.3f", mm.imputedDose[j]);
        ifprintf(dosages, "\n");
        if (probabilities != NULL)
           {
           ifprintf(probabilities, "%s\tPROB", (const char *) target.labels[i]);
           for (int j = startIndex; j <= stopIndex; j++)
              {
              double p1 = mm.imputedDose[j] - mm.imputedHap[j];
              double p2 = mm.imputedHap[j];
              if (p1 < 0.) p1 = 0.;
              double pHom = p1 * p2;
              double pHet = p1 * (1. - p2) + (1. - p1) * p2;
              ifprintf(probabilities, "%\t%.3f\t%.3f", pHom, pHet);
              }
           ifprintf(probabilities, "\n");
           }
        }
     delete [] padded;
     }
  ifclose(dosages);
  if (phased)
     {
     ifclose(hapdose);
     ifclose(haps);
     }
  if (probabilities != NULL)
     ifclose(probabilities);
  time_t stop = time(NULL);
  int seconds = stop - start;
  printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n",
         seconds / 3600, (seconds % 3600) / 60, seconds % 60,
         ctime(&stop));
  }

</source>