2 changes: 1 addition & 1 deletion .dockstore.yml
@@ -344,7 +344,7 @@ workflows:
branches:
- master
- ah_var_store
- master_merge_2025_07_16
- ploidy_status_rows
tags:
- /.*/
- name: GvsIngestTieout
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -131,7 +131,7 @@ task GetToolVersions {
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:524.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-06-30-alpine-61f482e29ee5"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-07-16-gatkbase-92f3d8e94af8"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-07-17-gatkbase-04a2f72cdb4b"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"
CreateVariantIngestFiles.java
@@ -170,6 +170,8 @@ public final class CreateVariantIngestFiles extends VariantWalker {

private boolean shouldWriteVCFHeadersLoadedStatusRow = false;

private boolean shouldWritePloidiesLoadedStatusRow = false;

private final Set<GQStateEnum> gqStatesToIgnore = new HashSet<>();

// getGenotypes() returns list of lists for all samples at variant
@@ -229,6 +231,7 @@ public void onTraversalStart() {
Boolean refRangesRowsExist = null;
Boolean vetRowsExist = null;
Boolean vcfHeaderRowsExist = null;
Boolean ploidyRowsExist = null;

// If BQ, check the load status table to see if this sample has already been loaded.
if (outputType == CommonCode.OutputType.BQ) {
Expand Down Expand Up @@ -257,6 +260,20 @@ public void onTraversalStart() {
}
shouldWriteReferencesLoadedStatusRow = true;
}

if (state.arePloidiesLoaded()) {
logger.info("Sample ID {}: Reference ranges writing enabled but PLOIDIES_LOADED status row found, skipping.", sampleId);
} else {
logger.info("Sample ID {}: Reference ranges writing enabled and no PLOIDIES_LOADED status row found, looking for existing rows.", sampleId);
ploidyRowsExist = SamplePloidyCreator.doRowsExistFor(outputType, projectID, datasetName, sampleId);
if (ploidyRowsExist) {
logger.warn("Reference ranges enabled for sample id = {}, name = {} but preexisting ploidy rows found, skipping ploidy data writes.",
sampleId, sampleName);
} else {
logger.info("Sample ID {}: No preexisting ploidy rows found.", sampleId);
}
shouldWritePloidiesLoadedStatusRow = true;
}
}

if (enableVet) {
@@ -299,14 +316,23 @@ public void onTraversalStart() {
// some class members that the CreateVariantIngestFilesTest expects to be initialized.
SAMSequenceDictionary seqDictionary = initializeGQConfigurationAndIntervals();

if (enableReferenceRanges && refRangesRowsExist == Boolean.FALSE) {
logger.info("Writing reference range data for sample id = {}, name = {}", sampleId, sampleName);
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);
if (enableReferenceRanges && (refRangesRowsExist == Boolean.FALSE || ploidyRowsExist == Boolean.FALSE)) {
// The refCreator is required if either reference ranges or ploidy data is being written, but should only
// write reference ranges if there are no preexisting reference range rows for this sample.
if (refRangesRowsExist == Boolean.FALSE) {
logger.info("Writing reference range data for sample id = {}, name = {}", sampleId, sampleName);
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, true, projectID, datasetName, storeCompressedReferences);
} else {
logger.info("*Not* writing reference range data for sample id = {}, name = {} but instantiating ref creator for ploidy writing", sampleId, sampleName);
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, false, projectID, datasetName, storeCompressedReferences);
}

// The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
// directly. If we're not ingesting references, we can't compute and ingest ploidy either
logger.info("Writing ploidy data for sample id = {}, name = {}", sampleId, sampleName);
samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType);
if (ploidyRowsExist == Boolean.FALSE) {
// The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
// directly. If we're not ingesting references, we can't compute and ingest ploidy either
logger.info("Writing ploidy data for sample id = {}, name = {}", sampleId, sampleName);
samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType);
}
}

if (enableVet && vetRowsExist == Boolean.FALSE) {
@@ -321,11 +347,11 @@ public void onTraversalStart() {

logger.info("enableReferenceRanges = {}, enableVet = {}, enableVCFHeaders = {}",
enableReferenceRanges, enableVet, enableVCFHeaders);
logger.info("shouldWriteReferencesLoadedStatus = {}, shouldWriteVariantsLoadedStatus = {}, shouldWriteVCFHeadersLoadedStatus = {}",
shouldWriteReferencesLoadedStatusRow, shouldWriteVariantsLoadedStatusRow, shouldWriteVCFHeadersLoadedStatusRow);
logger.info("shouldWriteReferencesLoadedStatus = {}, shouldWriteVariantsLoadedStatus = {}, shouldWriteVCFHeadersLoadedStatus = {}, shouldWritePloidiesLoadedStatus = {}",
shouldWriteReferencesLoadedStatusRow, shouldWriteVariantsLoadedStatusRow, shouldWriteVCFHeadersLoadedStatusRow, shouldWritePloidiesLoadedStatusRow);

if (refCreator == null && vetCreator == null && vcfHeaderLineScratchCreator == null &&
!shouldWriteReferencesLoadedStatusRow && !shouldWriteVariantsLoadedStatusRow && !shouldWriteVCFHeadersLoadedStatusRow) {
if (refCreator == null && vetCreator == null && vcfHeaderLineScratchCreator == null && samplePloidyCreator == null &&
!shouldWriteReferencesLoadedStatusRow && !shouldWriteVariantsLoadedStatusRow && !shouldWriteVCFHeadersLoadedStatusRow && !shouldWritePloidiesLoadedStatusRow) {
logger.info("No data to be written, exiting successfully.");
System.exit(0);
}
@@ -403,7 +429,7 @@ public Object onTraversalSuccess() {
refCreator.commitData();

// this is likely an unnecessary check as it currently stands, but it can't hurt to have it in case we
// later separate their creation, throwing the ploidy stuff explicity behind a flag
// later separate their creation, throwing the ploidy stuff explicitly behind a flag
if (samplePloidyCreator != null) {
try {
samplePloidyCreator.apply(refCreator.getReferencePloidyData(), refCreator.getTotalRefEntries());
@@ -423,6 +449,7 @@ public Object onTraversalSuccess() {
if (shouldWriteVCFHeadersLoadedStatusRow) loadStatus.writeHeadersLoadedStatus(sampleId);
if (shouldWriteVariantsLoadedStatusRow) loadStatus.writeVariantsLoadedStatus(sampleId);
if (shouldWriteReferencesLoadedStatusRow) loadStatus.writeReferencesLoadedStatus(sampleId);
if (shouldWritePloidiesLoadedStatusRow) loadStatus.writePloidiesLoadedStatus(sampleId);
}

return 0;
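Taken together, the changes above wire ploidy ingest into the tool's existing idempotency machinery: a PLOIDIES_LOADED status row means the sample is fully done and everything is skipped; otherwise, preexisting ploidy rows suppress the data write while the status row is still owed on commit. A minimal, self-contained sketch of that tri-state gating pattern (simplified, hypothetical names, not the actual GATK classes):

import java.util.Objects;

// Sketch of the ingest gating in onTraversalStart(). The boxed Boolean is
// deliberately tri-state: null = existence check never ran (e.g. non-BQ
// output), FALSE = checked and empty, TRUE = preexisting rows found.
final class PloidyGateSketch {
    private Boolean ploidyRowsExist = null;
    private boolean shouldWritePloidiesLoadedStatusRow = false;

    void checkLoadStatus(boolean ploidiesLoadedStatusRowFound, boolean rowsAlreadyInTable) {
        if (ploidiesLoadedStatusRowFound) {
            return; // sample already fully loaded: write nothing, not even a status row
        }
        ploidyRowsExist = rowsAlreadyInTable;
        // A status row is still owed on commit even when stray data rows
        // force us to skip the data write itself.
        shouldWritePloidiesLoadedStatusRow = true;
    }

    boolean shouldWritePloidyData() {
        // Write data only when the existence check ran and found nothing.
        return Objects.equals(ploidyRowsExist, Boolean.FALSE);
    }
}

The boxed Boolean rather than a primitive is what keeps "never checked" (non-BQ output) distinct from "checked, nothing found", which is why the diff compares against Boolean.FALSE explicitly.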
LoadStatus.java
@@ -24,7 +24,7 @@ public class LoadStatus {

// Preserve legacy 'STARTED' and 'FINISHED' states in the enum so this code doesn't crash if asked to convert one
// of these states from a stringified representation.
private enum LoadStatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED }
private enum LoadStatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED, PLOIDIES_LOADED }
public static class LoadState {
private final Set<LoadStatusValue> statusValues;

@@ -49,6 +49,10 @@ public boolean areVariantsLoaded() {
public boolean areHeadersLoaded() {
return statusValues.contains(LoadStatusValue.HEADERS_LOADED);
}

public boolean arePloidiesLoaded() {
return statusValues.contains(LoadStatusValue.PLOIDIES_LOADED);
}
}

private final String projectID;
@@ -107,6 +111,10 @@ public void writeHeadersLoadedStatus(long sampleId) {
writeLoadStatus(LoadStatusValue.HEADERS_LOADED, sampleId);
}

public void writePloidiesLoadedStatus(long sampleId) {
writeLoadStatus(LoadStatusValue.PLOIDIES_LOADED, sampleId);
}

protected void writeLoadStatus(LoadStatusValue status, long sampleId) {
final ExponentialBackOff backoff = new ExponentialBackOff.Builder().
setInitialIntervalMillis(2000).
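The LoadStatus side of the change is mechanical: one more enum constant and one more set-membership predicate. A self-contained sketch of the pattern (the constructor here is hypothetical; the real class populates its set from the BigQuery status table):

import java.util.EnumSet;
import java.util.Set;

// Sketch of the LoadState pattern: each status-table row contributes one
// enum value, and "is X loaded?" is a set-membership test. Legacy STARTED
// and FINISHED stay in the enum so stringified historical rows still parse.
final class LoadStateSketch {
    enum StatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED, PLOIDIES_LOADED }

    private final Set<StatusValue> statusValues = EnumSet.noneOf(StatusValue.class);

    LoadStateSketch(Iterable<String> rawStatusRows) {
        for (String raw : rawStatusRows) {
            statusValues.add(StatusValue.valueOf(raw)); // throws on unrecognized states
        }
    }

    boolean arePloidiesLoaded() {
        return statusValues.contains(StatusValue.PLOIDIES_LOADED);
    }
}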
RefCreator.java
@@ -85,22 +85,20 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, String
coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));

try {
if (writeReferenceRanges) {
final File refOutputFile = new File(outputDirectory, REF_RANGES_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + "." + outputType.toString().toLowerCase());
switch (outputType) {
case BQ:
if (projectId == null || datasetName == null) {
throw new UserException("Must specify project-id and dataset-name when using BQ output mode.");
}
refRangesWriter = new RefRangesBQWriter(projectId, datasetName,REF_RANGES_FILETYPE_PREFIX + tableNumber);
break;
case TSV:
refRangesWriter = new RefRangesTsvWriter(refOutputFile.getCanonicalPath());
break;
case AVRO:
refRangesWriter = new RefRangesAvroWriter(refOutputFile.getCanonicalPath());
break;
}
final File refOutputFile = new File(outputDirectory, REF_RANGES_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + "." + outputType.toString().toLowerCase());
switch (outputType) {
case BQ:
if (projectId == null || datasetName == null) {
throw new UserException("Must specify project-id and dataset-name when using BQ output mode.");
}
refRangesWriter = new RefRangesBQWriter(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber);
break;
case TSV:
refRangesWriter = new RefRangesTsvWriter(refOutputFile.getCanonicalPath());
break;
case AVRO:
refRangesWriter = new RefRangesAvroWriter(refOutputFile.getCanonicalPath());
break;
}
} catch (final IOException ioex) {
throw new UserException("Could not create reference range outputs", ioex);
Expand Down Expand Up @@ -130,36 +128,37 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
setCoveredInterval(variantChr, start, end);

// if we are writing ref ranges, and this is a reference block, write it!
if (writeReferenceRanges) {
if (variant.isReferenceBlock()) {
if (variant.isReferenceBlock()) {

// Record reference ploidy if this is not in a PAR
if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
// create the bitset for this ploidy if it isn't there
if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
}
// set the bit for this ploidy so we record having seen it
Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());

int ploidy = variant.getMaxPloidy(1);

Long currentCount = 0L;
if (ploidyCounts.containsKey(ploidy)) {
currentCount = ploidyCounts.get(ploidy);
}
// Record reference ploidy if this is not in a PAR
if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
// create the bitset for this ploidy if it isn't there
if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
}
// set the bit for this ploidy so we record having seen it
Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());

// increment counts for this one and put it back
++currentCount;
ploidyCounts.put(ploidy, currentCount);
int ploidy = variant.getMaxPloidy(1);

++totalRefEntries;
Long currentCount = 0L;
if (ploidyCounts.containsKey(ploidy)) {
currentCount = ploidyCounts.get(ploidy);
}

// increment counts for this one and put it back
++currentCount;
ploidyCounts.put(ploidy, currentCount);

++totalRefEntries;
}
}

if (writeReferenceRanges) {
if (variant.isReferenceBlock()) {
// break up reference blocks to be no longer than MAX_REFERENCE_BLOCK_SIZE
int localStart = start;
while ( localStart <= end ) {
while (localStart <= end) {
int length = Math.min(end - localStart + 1, IngestConstants.MAX_REFERENCE_BLOCK_BASES);
if (storeCompressedReferences) {
refRangesWriter.writeCompressed(
@@ -175,12 +174,11 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) throws IOException {
);
}

localStart = localStart + length ;
localStart = localStart + length;
}

// Write out no-calls as a single-base GQ0 reference.
// UNLESS we are ignoring GQ0, in which case ignore them too.
} else if (CreateVariantIngestFiles.isNoCall(variant) && (!this.gqStatesToIgnore.contains(GQStateEnum.ZERO))) {
// Write out no-calls as a single-base GQ0 reference.
// UNLESS we are ignoring GQ0, in which case ignore them too.
if (storeCompressedReferences) {
refRangesWriter.writeCompressed(
SchemaUtils.encodeCompressedRefBlock(variantChr, start, 1,
@@ -201,16 +199,18 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) throws IOException {
}

public void writeMissingIntervals(GenomeLocSortedSet intervalArgumentGenomeLocSortedSet) throws IOException {
GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
// for each block of uncovered locations
for (GenomeLoc genomeLoc : uncoveredIntervals) {
final String contig = genomeLoc.getContig();
// write all positions in this block
writeMissingPositions(
SchemaUtils.encodeLocation(contig, genomeLoc.getStart()),
SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()));
if (writeReferenceRanges) {
GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
// for each block of uncovered locations
for (GenomeLoc genomeLoc : uncoveredIntervals) {
final String contig = genomeLoc.getContig();
// write all positions in this block
writeMissingPositions(
SchemaUtils.encodeLocation(contig, genomeLoc.getStart()),
SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()));
}
}
}

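The substantive move in RefCreator is hoisting the ploidy tally out of the writeReferenceRanges guard in apply(), so it runs for every non-PAR reference block even when reference-range rows themselves are skipped (writeMissingIntervals gains the guard instead). Condensed, the tally is a contig → (ploidy → count) map; a sketch with accurate comments, since the "bitset" wording in the diff's comments is historical and the structure is an ordinary map of counts:

import java.util.HashMap;
import java.util.Map;

// Sketch of the ploidy tally in RefCreator.apply():
// contig -> (observed ploidy -> number of reference blocks seen at that ploidy).
final class PloidyTallySketch {
    private final Map<String, Map<Integer, Long>> ploidiesCountPerChromosome = new HashMap<>();
    private long totalRefEntries = 0;

    // contig and ploidy would come from variant.getContig() and
    // variant.getMaxPloidy(1); the caller is assumed to have already
    // excluded blocks overlapping a pseudoautosomal region.
    void recordReferenceBlock(String contig, int ploidy) {
        ploidiesCountPerChromosome
                .computeIfAbsent(contig, c -> new HashMap<>())
                .merge(ploidy, 1L, Long::sum); // increment-or-initialize in one step
        totalRefEntries++;
    }
}

computeIfAbsent plus Map.merge gives the same behavior as the containsKey/get/put sequence in the diff, just without the manual bookkeeping.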
SamplePloidyCreator.java
@@ -6,14 +6,12 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter;
import org.json.JSONObject;

import java.io.IOException;
import java.util.BitSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;

public class SamplePloidyCreator {
@@ -52,6 +50,10 @@ public SamplePloidyCreator(Long sampleId, String projectId, String datasetName,
}
}

public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, Long sampleId) {
if (outputType != CommonCode.OutputType.BQ) return false;
return BigQueryUtils.doRowsExistFor(projectId, datasetName, SAMPLE_PLOIDY_TABLE_NAME, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
}

public void apply(Map<String, Map<Integer, Long>> ploidyData, long totalRefEntries) throws IOException {
for (final Map.Entry<String, Map<Integer, Long>> ploidyLine : ploidyData.entrySet()) {
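The new static doRowsExistFor() gives SamplePloidyCreator the same preexisting-row probe the other creators use: trivially false for non-BQ output, otherwise delegated to BigQueryUtils. That helper's internals are not part of this diff; a hypothetical standalone equivalent using the google-cloud-bigquery client could look like the following (the SQL shape and all names are illustrative, not the GATK implementation):

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.QueryJobConfiguration;
import com.google.cloud.bigquery.TableResult;

// Hypothetical sketch of a row-existence probe like the one the diff reaches
// through BigQueryUtils.doRowsExistFor(); not the actual GATK code.
final class RowExistenceProbeSketch {
    static boolean doRowsExist(String projectId, String datasetName, String tableName,
                               String idColumn, long sampleId) throws InterruptedException {
        BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();
        // LIMIT 1 keeps the probe cheap: we only care whether any row matches.
        String sql = String.format("SELECT 1 FROM `%s.%s.%s` WHERE %s = %d LIMIT 1",
                projectId, datasetName, tableName, idColumn, sampleId);
        TableResult result = bigquery.query(QueryJobConfiguration.newBuilder(sql).build());
        return result.getTotalRows() > 0;
    }
}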