Skip to content

Commit da7cd83

Browse files
authored
Add new max alt argument for GenomicsDB and require it to be >= GGVCFs (#7655)
1 parent d373a50 commit da7cd83

File tree

17 files changed

+293
-179
lines changed

17 files changed

+293
-179
lines changed

src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBArgumentCollection.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,27 @@ public class GenomicsDBArgumentCollection implements Serializable {
1313
public static final String USE_GCS_HDFS_CONNECTOR = "genomicsdb-use-gcs-hdfs-connector";
1414

1515
public static final String CALL_GENOTYPES_LONG_NAME = "call-genotypes";
16+
public static final String MAX_ALTS_LONG_NAME = "genomicsdb-max-alternate-alleles";
1617
private static final boolean DEFAULT_CALL_GENOTYPES = false;
1718
private static final boolean DEFAULT_USE_BCF_CODEC = false;
1819
private static final boolean DEFAULT_SHARED_POSIXFS_OPTIMIZATIONS = false;
1920
private static final boolean DEFAULT_USE_GCS_HDFS_CONNECTOR = false;
2021

2122
/**
22-
* Not full-fledged arguments for now.
23+
* Maximum number of alternate alleles that will report likelihoods after being combined on reading from GenomicsDB (including <NON_REF>)
24+
* Must be at least one greater than the maximum number of alternate alleles for genotyping.
25+
* A typical value is 3 more than the --max-alternate-alleles value that's used by GenotypeGVCFs and larger differences
26+
* result in more robustness to PCR-related indel errors.
27+
* Note that GenotypeGVCFs will drop highly multi-allelic sites that are missing likelihoods.
28+
*
29+
* See also {@link org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection#MAX_ALTERNATE_ALLELES_LONG_NAME}
2330
*/
31+
@Argument(fullName = MAX_ALTS_LONG_NAME, doc = "Maximum number of alternate alleles that will be combined on reading from GenomicsDB")
2432
public int maxDiploidAltAllelesThatCanBeGenotyped = GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED;
2533

34+
/**
35+
* Output called genotypes in the final VCF (otherwise no-call)
36+
*/
2637
@Argument(fullName = CALL_GENOTYPES_LONG_NAME, doc = "Output called genotypes in final VCF (otherwise no-call)", optional = true)
2738
public boolean callGenotypes = DEFAULT_CALL_GENOTYPES;
2839

src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBOptions.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.broadinstitute.hellbender.tools.genomicsdb;
22

3+
import org.broadinstitute.hellbender.exceptions.UserException;
34
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
45

56
import java.nio.file.Path;
@@ -35,12 +36,16 @@ public GenomicsDBOptions(final Path reference, final GenomicsDBArgumentCollectio
3536
this.useBCFCodec = genomicsdbArgs.useBCFCodec;
3637
this.sharedPosixFSOptimizations = genomicsdbArgs.sharedPosixFSOptimizations;
3738
this.useGcsHdfsConnector = genomicsdbArgs.useGcsHdfsConnector;
39+
if (genomicsdbArgs.maxDiploidAltAllelesThatCanBeGenotyped - 1 < genotypeCalcArgs.maxAlternateAlleles) { //-1 for <NON_REF>
40+
throw new UserException.BadInput("GenomicsDB max alternate alleles (" + GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME
41+
+ ") must be at least one greater than genotype calculation max alternate alleles ("
42+
+ GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_LONG_NAME + "), accounting for the non-ref allele");
43+
}
44+
this.maxDiploidAltAllelesThatCanBeGenotyped = genomicsdbArgs.maxDiploidAltAllelesThatCanBeGenotyped;
3845
if (genotypeCalcArgs != null) {
39-
this.maxDiploidAltAllelesThatCanBeGenotyped = genotypeCalcArgs.maxAlternateAlleles;
4046
this.maxGenotypeCount = genotypeCalcArgs.maxGenotypeCount;
4147
} else {
4248
// Some defaults
43-
this.maxDiploidAltAllelesThatCanBeGenotyped = genomicsdbArgs.maxDiploidAltAllelesThatCanBeGenotyped;
4449
this.maxGenotypeCount = GenotypeCalculationArgumentCollection.DEFAULT_MAX_GENOTYPE_COUNT;
4550
}
4651
}

src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,12 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
8282
MathUtils.scaleLogSpaceArrayForNumericalStability(Arrays.stream(subsettedLikelihoodIndices)
8383
.mapToDouble(idx -> originalLikelihoods[idx]).toArray()) : null;
8484
if (newLikelihoods != null) {
85-
final int PLindex = MathUtils.maxElementIndex(newLikelihoods);
86-
newLog10GQ = GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods);
85+
if (newLikelihoods.length > 1) {
86+
final int PLindex = MathUtils.maxElementIndex(newLikelihoods); //pick out the call (log10L = 0)
87+
newLog10GQ = GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods);
88+
} else { //if we subset to just ref allele, keep the GQ
89+
newLog10GQ = g.getGQ()/-10.0; //-10 to go from Phred to log space
90+
}
8791
}
8892

8993
} else if (g.hasGQ()) {
@@ -99,8 +103,8 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
99103
//TODO: remove other G-length attributes, although that may require header parsing
100104
attributes.remove(VCFConstants.GENOTYPE_POSTERIORS_KEY);
101105
attributes.remove(GATKVCFConstants.GENOTYPE_PRIOR_KEY);
102-
gb.noAttributes().attributes(attributes);
103-
if (newLog10GQ != Double.NEGATIVE_INFINITY) {
106+
gb.noPL().noGQ().noAttributes().attributes(attributes); //if alleles are subset, old PLs and GQ are invalid
107+
if (newLog10GQ != Double.NEGATIVE_INFINITY && g.hasGQ()) { //only put GQ if originally present
104108
gb.log10PError(newLog10GQ);
105109
}
106110
if (useNewLikelihoods) {

src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeCalculationArgumentCollection.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,18 +131,22 @@ public GenotypeCalculationArgumentCollection clone() {
131131
/**
132132
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
133133
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
134-
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
134+
* scales exponentially based on the number of alternate alleles.
135+
*
136+
* Unless there is a good reason to change the default value, we highly recommend
135137
* that you not play around with this parameter.
136138
*
137-
* See also {@link #maxGenotypeCount}.
139+
* See also {@link #maxGenotypeCount} and {@link org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection#MAX_ALTS_LONG_NAME}.
140+
* This value can be no greater than one less than the corresponding GenomicsDB argument. Sites that exceed the
141+
* GenomicsDB alt allele max will not be output with likelihoods and will be dropped by GenotypeGVCFs.
138142
*/
139143
@Advanced
140144
@Argument(fullName = MAX_ALTERNATE_ALLELES_LONG_NAME, doc = "Maximum number of alternate alleles to genotype", optional = true)
141145
public int maxAlternateAlleles = DEFAULT_MAX_ALTERNATE_ALLELES;
142146

143147
/**
144148
* If there are more than this number of genotypes at a locus presented to the genotyper, then only this many genotypes will be used.
145-
* The possible genotypes are simply different ways of partitioning alleles given a specific ploidy asumption.
149+
* The possible genotypes are simply different ways of partitioning alleles given a specific ploidy assumption.
146150
* Therefore, we remove genotypes from consideration by removing alternate alleles that are the least well supported.
147151
* The estimate of allele support is based on the ranking of the candidate haplotypes coming out of the graph building step.
148152
* Note that the reference allele is always kept.
@@ -154,7 +158,8 @@ public GenotypeCalculationArgumentCollection clone() {
154158
* 1. the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #maxGenotypeCount}
155159
* 2. the value of {@link #maxAlternateAlleles}
156160
*
157-
* See also {@link #maxAlternateAlleles}.
161+
* See also {@link #maxAlternateAlleles} and
162+
* {@link org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection#maxDiploidAltAllelesThatCanBeGenotyped}
158163
*/
159164
@Advanced
160165
@Argument(fullName = MAX_GENOTYPE_COUNT_LONG_NAME, doc = "Maximum number of genotypes to consider at any site", optional = true)

src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ public Config getConfiguration() {
112112
}
113113

114114
/**
115-
* Main entry function to calculate genotypes of a given VC with corresponding GL's that is shared across genotypers (namely UG and HC).
115+
* Main entry function to calculate genotypes of a given VC with corresponding GLs that is shared across genotypers (namely GGVCFs and HC).
116116
*
117117
* Completes a variant context with genotype calls and associated annotations given the genotype likelihoods and
118118
* the model that need to be applied.
@@ -122,7 +122,7 @@ public Config getConfiguration() {
122122
*/
123123
public VariantContext calculateGenotypes(final VariantContext vc, final GenotypePriorCalculator gpc, final List<VariantContext> givenAlleles) {
124124
// if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call
125-
if (hasTooManyAlternativeAlleles(vc) || vc.getNSamples() == 0) {
125+
if (cannotBeGenotyped(vc) || vc.getNSamples() == 0) {
126126
return null;
127127
}
128128

@@ -390,14 +390,21 @@ boolean isVcCoveredByDeletion(final VariantContext vc) {
390390
* @return {@code true} iff there is too many alternative alleles based on
391391
* {@link GenotypeLikelihoods#MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED}.
392392
*/
393-
protected final boolean hasTooManyAlternativeAlleles(final VariantContext vc) {
393+
protected final boolean cannotBeGenotyped(final VariantContext vc) {
394394
// protect against too many alternate alleles that we can't even run AF on:
395-
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
395+
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED
396+
&& vc.getGenotypes().stream().anyMatch(GenotypeUtils::genotypeIsUsableForAFCalculation)) {
396397
return false;
397398
}
398-
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
399-
" alleles. Site will be skipped at location "+vc.getContig()+":"+vc.getStart());
400-
return true;
399+
if (vc.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
400+
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
401+
" alleles. Site will be skipped at location " + vc.getContig() + ":" + vc.getStart());
402+
return true;
403+
}else {
404+
logger.warn("No genotype contained sufficient data to recalculate genotypes. Site will be skipped at location "
405+
+ vc.getContig() + ":" + vc.getStart());
406+
return true;
407+
}
401408
}
402409

403410
/**

src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ public static AlleleFrequencyCalculator makeCalculator(final DragstrParams drags
6666

6767
/**
6868
*
69-
* @param g must have likelihoods or (if approximateHomRefsFromGQ is true) GQ
69+
* @param g must have likelihoods or (if approximateHomRefsFromGQ is true) be hom-ref with GQ
70+
* (see {@link org.broadinstitute.hellbender.utils.GenotypeUtils#genotypeIsUsableForAFCalculation(Genotype)
71+
* genotypeIsUsableForAFCalculation} )
7072
* @param glCalc
7173
* @param log10AlleleFrequencies
7274
* @return
@@ -75,7 +77,7 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
7577
final double[] log10Likelihoods;
7678
if (g.hasLikelihoods()) {
7779
log10Likelihoods = g.getLikelihoods().getAsVector();
78-
} else if ( g.isHomRef() || g.isNoCall()) {
80+
} else if (g.isHomRef()) {
7981
if (g.getPloidy() != 2) {
8082
throw new IllegalStateException("Likelihoods are required to calculate posteriors for hom-refs with ploidy != 2, " +
8183
"but were not found for genotype " + g + " with ploidy " + g.getPloidy());

src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,8 @@ public final class SelectVariants extends VariantWalker {
212212
* When this flag is enabled, all alternate alleles that are not present in the (output) samples will be removed.
213213
* Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be
214214
* removed and the record will contain a '.' in the ALT column. Note also that sites-only VCFs, by definition, do
215-
* not include the alternate allele in any genotype calls.
215+
* not include the alternate allele in any genotype calls. Further note that PLs will be trimmed appropriately,
216+
* removing likelihood information (even for homozygous reference calls).
216217
*/
217218
@Argument(fullName="remove-unused-alternates",
218219
doc="Remove alternate alleles not present in any genotypes", optional=true)
@@ -1030,7 +1031,9 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean prese
10301031
// fix the PL and AD values if sub has fewer alleles than original vc
10311032
final GenotypesContext subGenotypesWithOldAlleles = sub.getGenotypes(); //we need sub for the right samples, but PLs still go with old alleles
10321033
newGC = sub.getNAlleles() == vc.getNAlleles() ? subGenotypesWithOldAlleles :
1033-
AlleleSubsettingUtils.subsetAlleles(subGenotypesWithOldAlleles, 0, vc.getAlleles(), sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), true);
1034+
AlleleSubsettingUtils.subsetAlleles(subGenotypesWithOldAlleles, 0, vc.getAlleles(),
1035+
sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES,
1036+
vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), false);
10341037
} else {
10351038
newGC = sub.getGenotypes();
10361039
}

src/main/java/org/broadinstitute/hellbender/utils/GenotypeUtils.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,14 @@ public static GenotypeCounts computeDiploidGenotypeCounts(final VariantContext v
136136
return new GenotypeCounts(genotypeWithTwoRefsCount, genotypesWithOneRefCount, genotypesWithNoRefsCount);
137137
}
138138

139+
/**
140+
* Do we have (or can we infer) likelihoods necessary for allele frequency calculation?
141+
* Some reblocked and/or DRAGEN GVCFs omit likelihoods for ref blocks, but we can estimate them
142+
* @param g a genotype of unknown call and ploidy
143+
* @return true if we have enough info for AF calculation
144+
*/
139145
public static boolean genotypeIsUsableForAFCalculation(Genotype g) {
140-
return g.hasLikelihoods() || g.hasGQ() || g.getAlleles().stream().anyMatch(a -> a.isCalled() && a.isNonReference() && !a.isSymbolic());
146+
return g.hasLikelihoods() || (g.isHomRef() && g.hasGQ() && 2 == g.getPloidy());
141147
}
142148

143149
/**

0 commit comments

Comments
 (0)