2 changes: 1 addition & 1 deletion .dockstore.yml
@@ -344,7 +344,7 @@ workflows:
branches:
- master
- ah_var_store
- master_merge_2025_07_16
- ploidy_status_rows
tags:
- /.*/
- name: GvsIngestTieout
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -131,7 +131,7 @@ task GetToolVersions {
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:524.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-06-30-alpine-61f482e29ee5"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-07-16-gatkbase-92f3d8e94af8"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-07-17-gatkbase-04a2f72cdb4b"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"
CreateVariantIngestFiles.java
@@ -170,6 +170,8 @@ public final class CreateVariantIngestFiles extends VariantWalker {

private boolean shouldWriteVCFHeadersLoadedStatusRow = false;

private boolean shouldWritePloidiesLoadedStatusRow = false;

private final Set<GQStateEnum> gqStatesToIgnore = new HashSet<>();

// getGenotypes() returns list of lists for all samples at variant
@@ -229,6 +231,7 @@ public void onTraversalStart() {
Boolean refRangesRowsExist = null;
Boolean vetRowsExist = null;
Boolean vcfHeaderRowsExist = null;
Boolean ploidyRowsExist = null;

// If BQ, check the load status table to see if this sample has already been loaded.
if (outputType == CommonCode.OutputType.BQ) {
Expand Down Expand Up @@ -257,6 +260,20 @@ public void onTraversalStart() {
}
shouldWriteReferencesLoadedStatusRow = true;
}

if (state.arePloidiesLoaded()) {
logger.info("Sample ID {}: Reference ranges writing enabled but PLOIDIES_LOADED status row found, skipping.", sampleId);
} else {
logger.info("Sample ID {}: Reference ranges writing enabled and no PLOIDIES_LOADED status row found, looking for existing rows.", sampleId);
ploidyRowsExist = SamplePloidyCreator.doRowsExistFor(outputType, projectID, datasetName, sampleId);
if (ploidyRowsExist) {
logger.warn("Reference ranges enabled for sample id = {}, name = {} but preexisting ploidy rows found, skipping ploidy data writes.",
sampleId, sampleName);
} else {
logger.info("Sample ID {}: No preexisting ploidy rows found.", sampleId);
}
shouldWritePloidiesLoadedStatusRow = true;
}
}

if (enableVet) {
@@ -299,14 +316,23 @@ public void onTraversalStart() {
// some class members that the CreateVariantIngestFilesTest expects to be initialized.
SAMSequenceDictionary seqDictionary = initializeGQConfigurationAndIntervals();

if (enableReferenceRanges && refRangesRowsExist == Boolean.FALSE) {
logger.info("Writing reference range data for sample id = {}, name = {}", sampleId, sampleName);
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);
if (enableReferenceRanges && (refRangesRowsExist == Boolean.FALSE || ploidyRowsExist == Boolean.FALSE)) {
// The refCreator is required if either reference ranges or ploidy data is being written, but should only
// write reference ranges if there are no preexisting reference range rows for this sample.
if (refRangesRowsExist == Boolean.FALSE) {
logger.info("Writing reference range data for sample id = {}, name = {}", sampleId, sampleName);
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, true, projectID, datasetName, storeCompressedReferences);
} else {
logger.info("*Not* writing reference range data for sample id = {}, name = {} but instantiating ref creator for ploidy writing", sampleId, sampleName);
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, false, projectID, datasetName, storeCompressedReferences);
}

// The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
// directly. If we're not ingesting references, we can't compute and ingest ploidy either
logger.info("Writing ploidy data for sample id = {}, name = {}", sampleId, sampleName);
samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType);
if (ploidyRowsExist == Boolean.FALSE) {
// The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
// directly. If we're not ingesting references, we can't compute and ingest ploidy either
logger.info("Writing ploidy data for sample id = {}, name = {}", sampleId, sampleName);
samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType);
}
}

if (enableVet && vetRowsExist == Boolean.FALSE) {
@@ -321,11 +347,11 @@ public void onTraversalStart() {

logger.info("enableReferenceRanges = {}, enableVet = {}, enableVCFHeaders = {}",
enableReferenceRanges, enableVet, enableVCFHeaders);
logger.info("shouldWriteReferencesLoadedStatus = {}, shouldWriteVariantsLoadedStatus = {}, shouldWriteVCFHeadersLoadedStatus = {}",
shouldWriteReferencesLoadedStatusRow, shouldWriteVariantsLoadedStatusRow, shouldWriteVCFHeadersLoadedStatusRow);
logger.info("shouldWriteReferencesLoadedStatus = {}, shouldWriteVariantsLoadedStatus = {}, shouldWriteVCFHeadersLoadedStatus = {}, shouldWritePloidiesLoadedStatus = {}",
shouldWriteReferencesLoadedStatusRow, shouldWriteVariantsLoadedStatusRow, shouldWriteVCFHeadersLoadedStatusRow, shouldWritePloidiesLoadedStatusRow);

if (refCreator == null && vetCreator == null && vcfHeaderLineScratchCreator == null &&
!shouldWriteReferencesLoadedStatusRow && !shouldWriteVariantsLoadedStatusRow && !shouldWriteVCFHeadersLoadedStatusRow) {
if (refCreator == null && vetCreator == null && vcfHeaderLineScratchCreator == null && samplePloidyCreator == null &&
!shouldWriteReferencesLoadedStatusRow && !shouldWriteVariantsLoadedStatusRow && !shouldWriteVCFHeadersLoadedStatusRow && !shouldWritePloidiesLoadedStatusRow) {
logger.info("No data to be written, exiting successfully.");
System.exit(0);
}
@@ -403,7 +429,7 @@ public Object onTraversalSuccess() {
refCreator.commitData();

// this is likely an unnecessary check as it currently stands, but it can't hurt to have it in case we
// later separate their creation, throwing the ploidy stuff explicity behind a flag
// later separate their creation, throwing the ploidy stuff explicitly behind a flag
if (samplePloidyCreator != null) {
try {
samplePloidyCreator.apply(refCreator.getReferencePloidyData(), refCreator.getTotalRefEntries());
@@ -423,6 +449,7 @@ public Object onTraversalSuccess() {
if (shouldWriteVCFHeadersLoadedStatusRow) loadStatus.writeHeadersLoadedStatus(sampleId);
if (shouldWriteVariantsLoadedStatusRow) loadStatus.writeVariantsLoadedStatus(sampleId);
if (shouldWriteReferencesLoadedStatusRow) loadStatus.writeReferencesLoadedStatus(sampleId);
if (shouldWritePloidiesLoadedStatusRow) loadStatus.writePloidiesLoadedStatus(sampleId);
}

return 0;
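Taken together, the changes above wire ploidy ingest into the tool's existing idempotency machinery: a PLOIDIES_LOADED status row means the sample is fully done and everything is skipped; otherwise, preexisting ploidy rows suppress the data write while the status row is still owed on commit. A minimal, self-contained sketch of that tri-state gating pattern (simplified, hypothetical names, not the actual GATK classes):

import java.util.Objects;

// Sketch of the ingest gating in onTraversalStart(). The boxed Boolean is
// deliberately tri-state: null = existence check never ran (e.g. non-BQ
// output), FALSE = checked and empty, TRUE = preexisting rows found.
final class PloidyGateSketch {
    private Boolean ploidyRowsExist = null;
    private boolean shouldWritePloidiesLoadedStatusRow = false;

    void checkLoadStatus(boolean ploidiesLoadedStatusRowFound, boolean rowsAlreadyInTable) {
        if (ploidiesLoadedStatusRowFound) {
            return; // sample already fully loaded: write nothing, not even a status row
        }
        ploidyRowsExist = rowsAlreadyInTable;
        // A status row is still owed on commit even when stray data rows
        // force us to skip the data write itself.
        shouldWritePloidiesLoadedStatusRow = true;
    }

    boolean shouldWritePloidyData() {
        // Write data only when the existence check ran and found nothing.
        return Objects.equals(ploidyRowsExist, Boolean.FALSE);
    }
}

The boxed Boolean rather than a primitive is what keeps "never checked" (non-BQ output) distinct from "checked, nothing found", which is why the diff compares against Boolean.FALSE explicitly.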
LoadStatus.java
@@ -24,7 +24,7 @@ public class LoadStatus {

// Preserve legacy 'STARTED' and 'FINISHED' states in the enum so this code doesn't crash if asked to convert one
// of these states from a stringified representation.
private enum LoadStatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED }
private enum LoadStatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED, PLOIDIES_LOADED }
public static class LoadState {
private final Set<LoadStatusValue> statusValues;

@@ -49,6 +49,10 @@ public boolean areVariantsLoaded() {
public boolean areHeadersLoaded() {
return statusValues.contains(LoadStatusValue.HEADERS_LOADED);
}

public boolean arePloidiesLoaded() {
return statusValues.contains(LoadStatusValue.PLOIDIES_LOADED);
}
}

private final String projectID;
@@ -107,6 +111,10 @@ public void writeHeadersLoadedStatus(long sampleId) {
writeLoadStatus(LoadStatusValue.HEADERS_LOADED, sampleId);
}

public void writePloidiesLoadedStatus(long sampleId) {
writeLoadStatus(LoadStatusValue.PLOIDIES_LOADED, sampleId);
}

protected void writeLoadStatus(LoadStatusValue status, long sampleId) {
final ExponentialBackOff backoff = new ExponentialBackOff.Builder().
setInitialIntervalMillis(2000).
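The LoadStatus side of the change is mechanical: one more enum constant and one more set-membership predicate. A self-contained sketch of the pattern (the constructor here is hypothetical; the real class populates its set from the BigQuery status table):

import java.util.EnumSet;
import java.util.Set;

// Sketch of the LoadState pattern: each status-table row contributes one
// enum value, and "is X loaded?" is a set-membership test. Legacy STARTED
// and FINISHED stay in the enum so stringified historical rows still parse.
final class LoadStateSketch {
    enum StatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED, PLOIDIES_LOADED }

    private final Set<StatusValue> statusValues = EnumSet.noneOf(StatusValue.class);

    LoadStateSketch(Iterable<String> rawStatusRows) {
        for (String raw : rawStatusRows) {
            statusValues.add(StatusValue.valueOf(raw)); // throws on unrecognized states
        }
    }

    boolean arePloidiesLoaded() {
        return statusValues.contains(StatusValue.PLOIDIES_LOADED);
    }
}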
RefCreator.java
@@ -85,22 +85,20 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, String
coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));

try {
if (writeReferenceRanges) {
final File refOutputFile = new File(outputDirectory, REF_RANGES_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + "." + outputType.toString().toLowerCase());
switch (outputType) {
case BQ:
if (projectId == null || datasetName == null) {
throw new UserException("Must specify project-id and dataset-name when using BQ output mode.");
}
refRangesWriter = new RefRangesBQWriter(projectId, datasetName,REF_RANGES_FILETYPE_PREFIX + tableNumber);
break;
case TSV:
refRangesWriter = new RefRangesTsvWriter(refOutputFile.getCanonicalPath());
break;
case AVRO:
refRangesWriter = new RefRangesAvroWriter(refOutputFile.getCanonicalPath());
break;
}
final File refOutputFile = new File(outputDirectory, REF_RANGES_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + "." + outputType.toString().toLowerCase());
switch (outputType) {
case BQ:
if (projectId == null || datasetName == null) {
throw new UserException("Must specify project-id and dataset-name when using BQ output mode.");
}
refRangesWriter = new RefRangesBQWriter(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber);
break;
case TSV:
refRangesWriter = new RefRangesTsvWriter(refOutputFile.getCanonicalPath());
break;
case AVRO:
refRangesWriter = new RefRangesAvroWriter(refOutputFile.getCanonicalPath());
break;
}
} catch (final IOException ioex) {
throw new UserException("Could not create reference range outputs", ioex);
Expand Down Expand Up @@ -130,36 +128,37 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
setCoveredInterval(variantChr, start, end);

// if we are writing ref ranges, and this is a reference block, write it!
if (writeReferenceRanges) {
if (variant.isReferenceBlock()) {
if (variant.isReferenceBlock()) {

// Record reference ploidy if this is not in a PAR
if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
// create the bitset for this ploidy if it isn't there
if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
}
// set the bit for this ploidy so we record having seen it
Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());

int ploidy = variant.getMaxPloidy(1);

Long currentCount = 0L;
if (ploidyCounts.containsKey(ploidy)) {
currentCount = ploidyCounts.get(ploidy);
}
// Record reference ploidy if this is not in a PAR
if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
// create the bitset for this ploidy if it isn't there
if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
}
// set the bit for this ploidy so we record having seen it
Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());

// increment counts for this one and put it back
++currentCount;
ploidyCounts.put(ploidy, currentCount);
int ploidy = variant.getMaxPloidy(1);

++totalRefEntries;
Long currentCount = 0L;
if (ploidyCounts.containsKey(ploidy)) {
currentCount = ploidyCounts.get(ploidy);
}

// increment counts for this one and put it back
++currentCount;
ploidyCounts.put(ploidy, currentCount);

++totalRefEntries;
}
}

if (writeReferenceRanges) {
if (variant.isReferenceBlock()) {
// break up reference blocks to be no longer than MAX_REFERENCE_BLOCK_SIZE
int localStart = start;
while ( localStart <= end ) {
while (localStart <= end) {
int length = Math.min(end - localStart + 1, IngestConstants.MAX_REFERENCE_BLOCK_BASES);
if (storeCompressedReferences) {
refRangesWriter.writeCompressed(
@@ -175,12 +174,11 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) throws IOException {
);
}

localStart = localStart + length ;
localStart = localStart + length;
}

// Write out no-calls as a single-base GQ0 reference.
// UNLESS we are ignoring GQ0, in which case ignore them too.
} else if (CreateVariantIngestFiles.isNoCall(variant) && (!this.gqStatesToIgnore.contains(GQStateEnum.ZERO))) {
// Write out no-calls as a single-base GQ0 reference.
// UNLESS we are ignoring GQ0, in which case ignore them too.
if (storeCompressedReferences) {
refRangesWriter.writeCompressed(
SchemaUtils.encodeCompressedRefBlock(variantChr, start, 1,
@@ -201,16 +199,18 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) throws IOException {
}

public void writeMissingIntervals(GenomeLocSortedSet intervalArgumentGenomeLocSortedSet) throws IOException {
GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
// for each block of uncovered locations
for (GenomeLoc genomeLoc : uncoveredIntervals) {
final String contig = genomeLoc.getContig();
// write all positions in this block
writeMissingPositions(
SchemaUtils.encodeLocation(contig, genomeLoc.getStart()),
SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()));
if (writeReferenceRanges) {
GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
// for each block of uncovered locations
for (GenomeLoc genomeLoc : uncoveredIntervals) {
final String contig = genomeLoc.getContig();
// write all positions in this block
writeMissingPositions(
SchemaUtils.encodeLocation(contig, genomeLoc.getStart()),
SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()));
}
}
}

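The substantive move in RefCreator is hoisting the ploidy tally out of the writeReferenceRanges guard in apply(), so it runs for every non-PAR reference block even when reference-range rows themselves are skipped (writeMissingIntervals gains the guard instead). Condensed, the tally is a contig → (ploidy → count) map; a sketch with accurate comments, since the "bitset" wording in the diff's comments is historical and the structure is an ordinary map of counts:

import java.util.HashMap;
import java.util.Map;

// Sketch of the ploidy tally in RefCreator.apply():
// contig -> (observed ploidy -> number of reference blocks seen at that ploidy).
final class PloidyTallySketch {
    private final Map<String, Map<Integer, Long>> ploidiesCountPerChromosome = new HashMap<>();
    private long totalRefEntries = 0;

    // contig and ploidy would come from variant.getContig() and
    // variant.getMaxPloidy(1); the caller is assumed to have already
    // excluded blocks overlapping a pseudoautosomal region.
    void recordReferenceBlock(String contig, int ploidy) {
        ploidiesCountPerChromosome
                .computeIfAbsent(contig, c -> new HashMap<>())
                .merge(ploidy, 1L, Long::sum); // increment-or-initialize in one step
        totalRefEntries++;
    }
}

computeIfAbsent plus Map.merge gives the same behavior as the containsKey/get/put sequence in the diff, just without the manual bookkeeping.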
SamplePloidyCreator.java
@@ -6,14 +6,12 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter;
import org.json.JSONObject;

import java.io.IOException;
import java.util.BitSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;

public class SamplePloidyCreator {
@@ -52,6 +50,10 @@ public SamplePloidyCreator(Long sampleId, String projectId, String datasetName,
}
}

public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, Long sampleId) {
if (outputType != CommonCode.OutputType.BQ) return false;
return BigQueryUtils.doRowsExistFor(projectId, datasetName, SAMPLE_PLOIDY_TABLE_NAME, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
}

public void apply(Map<String, Map<Integer, Long>> ploidyData, long totalRefEntries) throws IOException {
for (final Map.Entry<String, Map<Integer, Long>> ploidyLine : ploidyData.entrySet()) {
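The new static doRowsExistFor() gives SamplePloidyCreator the same preexisting-row probe the other creators use: trivially false for non-BQ output, otherwise delegated to BigQueryUtils. That helper's internals are not part of this diff; a hypothetical standalone equivalent using the google-cloud-bigquery client could look like the following (the SQL shape and all names are illustrative, not the GATK implementation):

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.QueryJobConfiguration;
import com.google.cloud.bigquery.TableResult;

// Hypothetical sketch of a row-existence probe like the one the diff reaches
// through BigQueryUtils.doRowsExistFor(); not the actual GATK code.
final class RowExistenceProbeSketch {
    static boolean doRowsExist(String projectId, String datasetName, String tableName,
                               String idColumn, long sampleId) throws InterruptedException {
        BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();
        // LIMIT 1 keeps the probe cheap: we only care whether any row matches.
        String sql = String.format("SELECT 1 FROM `%s.%s.%s` WHERE %s = %d LIMIT 1",
                projectId, datasetName, tableName, idColumn, sampleId);
        TableResult result = bigquery.query(QueryJobConfiguration.newBuilder(sql).build());
        return result.getTotalRows() > 0;
    }
}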