Difference between revisions of "830 - MINIMAC::main()"

From Genome Analysis Wiki
Jump to navigationJump to search
(Created page with "</source lang="cpp"> int main(int argc, char ** argv) { time_t start = time(NULL); printf("MiniMac - Imputation into phased haplotypes\n" "(c) 2010-2011 Goncalo...")
 
Line 1: Line 1:
</source lang="cpp">
+
<source lang="cpp">
  
 
int main(int argc, char ** argv)
 
int main(int argc, char ** argv)

Revision as of 14:52, 2 October 2013

int main(int argc, char ** argv)
   {
   time_t start = time(NULL);

   printf("MiniMac - Imputation into phased haplotypes\n"
		  "(c) 2010-2011 Goncalo Abecasis, Christian Fuchsberger\n");

   ReadParameters(argc, argv);

   // Read marker list
   printf("Reading Reference Marker List ...\n");

   StringArray refMarkerList;
   refMarkerList.Read(referenceSnps);

   // Index markers
   StringIntHash referenceHash;
   for (int i = 0; i < refMarkerList.Length(); i++)
	  referenceHash.Add(refMarkerList[i].Trim(), i);

   printf("  %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());

   if (refMarkerList.Length() == 0)
      error("Reference panel marker list is empty - please verify filenames and contents!");

   // Load reference haplotypes
   printf("Loading reference haplotypes ...\n");
   HaplotypeSet reference;

   reference.markerCount = refMarkerList.Length();
   reference.LoadHaplotypes(referenceHaplotypes);

   printf("  %d Reference Haplotypes Loaded ...\n\n", reference.count);

   // Read framework marker list
   printf("Reading Framework Marker List ...\n");
   StringArray markerList;
   markerList.Read(snps);

   if (markerList.Length() == 0)
      error("Framework marker list is empty - please verify filenames and contents!");

   // Crossref Marker Names to Reference Panel Positions
   IntArray markerIndex;
   markerIndex.Dimension(markerList.Length());

   int matches = 0;

   for (int i = 0; i < markerList.Length(); i++)
      {
      markerIndex[i] = referenceHash.Integer(markerList[i].Trim());

      if (markerIndex[i] >= 0) matches++;
      }

   printf("  %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);

   if (matches == 0)
      error("No markers overlap between target and reference\n"
            "Please check correct reference is being used and markers are named consistently");

   printf("  %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);

   // Check for flips in reference vs. target haplotypes
   int flips = 0;
   int previous = -1;
   for (int i = 0; i < markerIndex.Length(); i++)
      if (markerIndex[i] >= 0)
         if (markerIndex[i] < previous)
            {
            if (flips++ < 10)
               printf("  -> Marker %s precedes %s in reference, but follows it in target\n",
                     (const char *) refMarkerList[previous],
                     (const char *) markerList[i]);
            previous = markerIndex[i];
            }
   if (flips > 10)
      printf("  -> %d Additional Marker Order Changes Not Listed\n", flips - 10);
   if (flips)
      printf("  %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips);

   // Load target haplotypes
   printf("Loading target haplotypes ...\n");
   HaplotypeSet target;

   target.markerCount = markerList.Length();
   target.LoadHaplotypes(haplotypes, true);

   reference.CalculateFrequencies();
   target.CalculateFrequencies();
   target.CompareFrequencies(reference, markerIndex, markerList);

   printf("  %d Target Haplotypes Loaded ...\n\n", target.count);

   int startIndex = 0;
   int stopIndex = reference.markerCount - 1;

   printf("Setting up Markov Model...\n\n");

   // Setup Markov Model
   MarkovParameters mp;

   mp.Allocate(reference.markerCount);

   if (rounds > 0)
      printf("Initializing Model Parameters (using E-M and up to %d haplotypes)\n", states);

   // Simple initial estimates of error and recombination rate
   for (int i = 0; i < reference.markerCount; i++)
      mp.E[i] = 0.01;

   for (int i = 0; i < reference.markerCount - 1; i++)
      mp.R[i] = 0.001;

   if (mp.ReadErrorRates(errorRates))
      printf("  Updated error rates using data in %s ...\n", (const char *) errorRates);

   if (mp.ReadCrossoverRates(recombinationRates))
      printf("  Updated recombination rates using %s ...\n", (const char *) recombinationRates);

   // Parameter estimation loop
   for (int round = 0; round < rounds; round++)
      {
      printf("  Round %d of Parameter Refinement ...\n", round + 1);

      int iterations = states < reference.count ? states : reference.count;

      MarkovModel original;
      original.CopyParameters(mp);

      #pragma omp parallel for
      for (int i = 0; i < iterations; i++)
         {
         MarkovModel mm;

         mm.Allocate(reference.markerCount, reference.count - 1);
         mm.CopyParameters(original);

         // Reference leave one out (loo) panel
         char ** reference_loo = new char * [reference.count - 1];
         for (int in = 0, out = 0; in < reference.count; in++)
             if (in != i)
                reference_loo[out++] = reference.haplotypes[in];

         mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq);
         mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);

         delete [] reference_loo;

         #pragma omp critical
         mp += mm;
         }

      if (round >= rounds / 2)
         {
         int iterations = states < target.count ? states : target.count;

         #pragma omp parallel for
         for (int i = 0; i < iterations; i++)
            {
            MarkovModel mm;

            mm.Allocate(reference.markerCount, reference.count);
            mm.CopyParameters(original);

            // Padded version of target haplotype, including missing sites
            char * padded = new char [reference.markerCount];
            for (int k = 0; k < reference.markerCount; k++)
               padded[k] = 0;

            // Copy current haplotype into padded vector
            for (int j = 0; j < target.markerCount; j++)
               if (markerIndex[j] >= 0)
                  padded[markerIndex[j]] = target.haplotypes[i][j];

            mm.WalkLeft(padded, reference.haplotypes, reference.freq);
            mm.CountExpected(padded, reference.haplotypes, reference.freq);

            delete [] padded;

            #pragma omp critical
            mp += mm;
            }
         }

      mp.UpdateModel();
      }

   if (rounds > 0)
      {
      printf("  Saving estimated parameters for future use ...\n");
      mp.WriteParameters(refMarkerList, prefix, gzip);
      }

   printf("\n");

   // List the major allele at each location
   reference.ListMajorAlleles();

   printf("Generating Draft .info File ...\n\n");

   // Output some basic information
   IFILE info = ifopen(prefix + ".info.draft", "wb");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n");

   for (int i = 0, j = 0; i <= stopIndex; i++)
      if (i >= startIndex)
         ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n",
            (const char *) refMarkerList[i],
            reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i),
            reference.freq[reference.major[i]][i],
            j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-");
      else
         if (j < markerIndex.Length() && i == markerIndex[j])
            j++;

   ifclose(info);

   printf("Imputing Genotypes ...\n");

   IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wb");
   IFILE probabilities, hapdose, haps;

   if (dosages == NULL)
      error("Error opening output file for storing dosage information.");

   if (phased)
      {
      hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wb");
      haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wb");

      if (hapdose == NULL || haps == NULL)
         error("Error opening output files for storing haplotype information.");
      }


   if (probs)
      {
      probabilities = ifopen(prefix + ".prob" + (gzip ? ".gz" : ""), "wb");

      if (hapdose == NULL || haps == NULL)
         error("Error opening output file for storing genotype probabilities.");
      }

   ImputationStatistics stats(reference.markerCount);

   // Impute each haplotype
   #pragma omp parallel for
   for (int i = 0; i < target.count; i++)
      {
      if (i != 0 && target.labels[i] == target.labels[i-1])
         continue;

      MarkovModel mm;

      mm.Allocate(reference.markerCount, reference.count);
      mm.ClearImputedDose();
      mm.CopyParameters(mp);

      // Padded version of target haplotype, including missing sites
      char * padded = new char [reference.markerCount];
      for (int j = 0; j < reference.markerCount; j++)
         padded[j] = 0;

      int k = i;

      do {
         printf("  Processing Haplotype %d of %d ...\n", k + 1, target.count);

         // Copy current haplotype into padded vector
         for (int j = 0; j < target.markerCount; j++)
            if (markerIndex[j] >= 0)
               padded[markerIndex[j]] = target.haplotypes[k][j];

         mm.WalkLeft(padded, reference.haplotypes, reference.freq);
         mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);

         #pragma omp critical
         { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }

         #pragma omp critical
         if (phased)
            {
            ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            for (int j = startIndex; j <= stopIndex; j++)
               {
               ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]);
               ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]);
               }
            ifprintf(hapdose, "\n");
            ifprintf(haps, "\n");
            }

         k++;
      } while (k < target.count && target.labels[k] == target.labels[i]);

      printf("    Outputting Individual %s ...\n", (const char *) target.labels[i]);

      #pragma omp critical
         {
         ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]);
         for (int j = startIndex; j <= stopIndex; j++)
            ifprintf(dosages, "\t%.3f", mm.imputedDose[j]);
         ifprintf(dosages, "\n");

         if (probabilities != NULL)
            {
            ifprintf(probabilities, "%s\tPROB", (const char *) target.labels[i]);
            for (int j = startIndex; j <= stopIndex; j++)
               {
               double p1 = mm.imputedDose[j] - mm.imputedHap[j];
               double p2 = mm.imputedHap[j];

               if (p1 < 0.) p1 = 0.;

               double pHom = p1 * p2;
               double pHet = p1 * (1. - p2) + (1. - p1) * p2;

               ifprintf(probabilities, "%\t%.3f\t%.3f", pHom, pHet);
               }
            ifprintf(probabilities, "\n");
            }
         }

      delete [] padded;
      }

   ifclose(dosages);

   if (phased)
      {
      ifclose(hapdose);
      ifclose(haps);
      }

   if (probabilities != NULL)
      ifclose(probabilities);

   time_t stop = time(NULL);
   int seconds = stop - start;

   printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n",
          seconds / 3600, (seconds % 3600) / 60, seconds % 60,
          ctime(&stop));
   }