Skip to content

Commit f05a5eb

Browse files
committed
Introduces more efficient variant integration functions depending on whether to include or exclude gaps.
1 parent dc211fd commit f05a5eb

File tree

1 file changed

+163
-26
lines changed

1 file changed

+163
-26
lines changed

src/main/java/util/Bio.java

Lines changed: 163 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -474,30 +474,46 @@ public static String stripGaps(String s) {
474474
* Integrates variants into a reference sequence.
475475
* <p>
476476
* This method modifies the given reference sequence by incorporating the specified variants. Variants are represented as a
477-
* {@link NavigableMap} where the key is the 0-based position in the reference sequence, and the value is the alternative base
478-
* sequence.
479-
* <p>
477+
* {@link NavigableMap} where the key is the 0-based position in the reference sequence, and the value is the alternative base sequence.
480478
* Variants have to be in canonical form, i.e. they must start with a non-gap character that is assumed to be in the coordinate system
481479
* of the reference sequence. If additional characters follow, these have to be all gaps (indicating a deletion) or all non-gaps
482480
* (indicating an insertion).
483481
* <p>
484-
* The method handles deletions by tracking the number of gaps introduced and ensures that the resulting sequence reflects the
485-
* integrated variants. Optionally, gaps can be stripped from the final sequence.
482+
* The method handles deletions by tracking the number of deleted downstream positions. These are either replaced by gap symbols or
483+
* ignored, depending on the value specified for {@code excludeGaps}.
486484
*
487-
* @param reference The original reference sequence as a {@link String}.
488-
* @param variants A {@link NavigableMap} containing the variants to integrate, where the key is the position and the value is the
489-
* alternative base sequence in canonical form.
490-
* @param stripGaps A {@code boolean} indicating whether to remove gaps from the resulting sequence.
485+
* @param reference The original reference sequence as a {@link String}.
486+
* @param variants A {@link NavigableMap} containing the variants to integrate, where the key is the position and the value is the
487+
* alternative base sequence in canonical form.
488+
* @param excludeGaps A {@code boolean} indicating whether to exclude gaps from the resulting sequence.
491489
* @return A {@link String} representing the reference sequence with the integrated variants. If {@code excludeGaps} is {@code true}, gaps
492490
* are removed from the resulting sequence.
493491
* @throws IllegalArgumentException If the reference sequence is empty.
494492
* @throws MusialException If an invalid variant is encountered.
495493
*/
496-
public static String integrateVariants(String reference, NavigableMap<Integer, String> variants, boolean stripGaps) throws MusialException {
494+
public static String integrateVariants(String reference, NavigableMap<Integer, String> variants, boolean excludeGaps) throws MusialException {
497495
// Validate input: an empty reference is rejected; without variants the reference is returned as-is (no gap removal is applied to it).
498496
if (reference.isEmpty()) throw new IllegalArgumentException("Specified reference sequence is empty.");
499497
if (variants.isEmpty()) return reference;
500498

499+
// Delegate to the gap-excluding or the gap-preserving implementation, depending on the excludeGaps flag.
500+
if (excludeGaps) {
501+
return integrateVariantsWithoutGaps(reference, variants);
502+
} else {
503+
return integrateVariantsWithGaps(reference, variants);
504+
}
505+
}
506+
507+
/**
508+
* Integrates variants into a reference sequence {@link String} while preserving gaps.
509+
*
510+
* @param reference The original reference sequence as a {@link String}.
511+
* @param variants A {@link NavigableMap} containing the variants to integrate, where the key is the position and the value is the
512+
* alternative base sequence in canonical form.
513+
* @return A {@link String} representing the reference sequence with the integrated variants, including gaps.
514+
* @throws MusialException If an invalid variant is encountered.
515+
*/
516+
private static String integrateVariantsWithGaps(String reference, NavigableMap<Integer, String> variants) throws MusialException {
501517
// Initialize result builder and deletion counter.
502518
StringBuilder result = new StringBuilder(reference.length());
503519
int deletions = 0;
@@ -530,37 +546,108 @@ public static String integrateVariants(String reference, NavigableMap<Integer, S
530546
}
531547
}
532548

533-
// Return the final sequence, optionally stripping gaps.
534-
return stripGaps ? stripGaps(result.toString()) : result.toString();
549+
return result.toString();
550+
}
551+
552+
/**
553+
* Integrates variants into a reference sequence {@link String} while removing gaps.
554+
*
555+
* @param reference The original reference sequence as a {@link String}.
556+
* @param variants A {@link NavigableMap} containing the variants to integrate, where the key is the position and the value is the
557+
* alternative base sequence in canonical form.
558+
* @return A {@link String} representing the reference sequence with the integrated variants, excluding gaps.
559+
* @throws MusialException If an invalid variant is encountered.
560+
*/
561+
private static String integrateVariantsWithoutGaps(String reference, NavigableMap<Integer, String> variants) throws MusialException {
562+
// Initialize result builder and the counter of pending deleted positions that still have to be skipped.
563+
StringBuilder result = new StringBuilder(reference.length());
564+
int deletions = 0;
565+
566+
// Process each 0-based index of the reference sequence.
567+
for (int i = 0; i < reference.length(); i++) {
568+
// Check if a variant exists at the current position.
569+
if (variants.containsKey(i)) {
570+
String variant = variants.get(i);
571+
// A position covered by a pending deletion is skipped: no gap is emitted, only the counter is decremented.
572+
if (deletions > 0) {
573+
deletions--;
574+
} else {
575+
// Otherwise, append the first character of the variant.
576+
result.append(variant.charAt(0));
577+
}
578+
// Evaluated even if the position itself was skipped: append inserted bases, extend the pending deletion count, or reject a variant that is neither insertion, deletion nor substitution.
579+
if (isInsertion(variant)) {
580+
result.append(variant.substring(1));
581+
} else if (isDeletion(variant)) {
582+
deletions += variant.length() - 1;
583+
} else if (!isSubstitution(variant)) {
584+
throw new MusialException("Invalid variant '%s' at position %d.".formatted(variant, i));
585+
}
586+
} else {
587+
// Skip the position if a deletion is pending; otherwise append the current character from the reference sequence.
588+
if (deletions > 0) {
589+
deletions--;
590+
} else {
591+
result.append(reference.charAt(i));
592+
}
593+
}
594+
}
595+
596+
return result.toString();
535597
}
536598

537599
/**
538600
* Integrates variants into a reference sequence.
539601
* <p>
540602
* This method modifies the given reference sequence by incorporating the specified variants. Variants are represented as a {@link Map}
541-
* where the key is the position in the reference sequence, and the value is the alternative base sequence.
542-
* <p>
543-
* The method handles substitutions, insertions, and deletions: - Substitutions replace the reference character at the position. -
544-
* Insertions add additional characters after the reference character. - Deletions remove characters from the reference sequence.
603+
* where the key is the 1-based position in the reference sequence, and the value is the alternative base sequence. Variants have to be
604+
* in canonical form, i.e. they must start with a non-gap character that is assumed to be in the coordinate system of the reference
605+
* sequence. If additional characters follow, these have to be all gaps (indicating a deletion) or all non-gaps (indicating an
606+
* insertion).
545607
* <p>
546-
* The method also accounts for extensions (e.g., gaps) and ensures that the resulting sequence reflects the integrated variants.
547-
* Optionally, gaps can be stripped from the final sequence.
608+
* In contrast to {@link #integrateVariants(String, NavigableMap, boolean)}, the reference sequence is provided as a
609+
* {@link NavigableMap} where the key is the 1-based position and the value is a {@link ReferenceContext} containing the character and
610+
* the maximal number of inserted bases at the position with respect to a collection of biological samples. This allows representing gaps that have
611+
* been introduced by insertions in other biological samples. These gaps, as well as those induced by deletions, can be either preserved or
612+
* removed from the resulting sequence, depending on the value specified for {@code excludeGaps}.
548613
*
549-
* @param reference A {@link NavigableMap} representing the reference sequence, where the key is the position and the value is a
550-
* {@link ReferenceContext} containing the character and extension at that position.
551-
* @param variants A {@link Map} containing the variants to integrate, where the key is the position and the value is the alternative
552-
* base sequence.
553-
* @param stripGaps A {@code boolean} indicating whether to remove gaps from the resulting sequence.
614+
* @param reference A {@link NavigableMap} representing the reference sequence, where the key is the position and the value is a
615+
* {@link ReferenceContext} containing the character and extension at that position.
616+
* @param variants A {@link Map} containing the variants to integrate, where the key is the position and the value is the alternative
617+
* base sequence.
618+
* @param excludeGaps A {@code boolean} indicating whether to remove gaps from the resulting sequence.
554619
* @return A {@link String} representing the reference sequence with the integrated variants. If {@code excludeGaps} is {@code true}, gaps
555620
* are removed from the resulting sequence.
556621
* @throws IllegalArgumentException If the reference sequence is empty or if an invalid variant is encountered.
557622
* @throws MusialException If an invalid variant is encountered.
558623
*/
559624
public static String integrateVariants(NavigableMap<Integer, ReferenceContext> reference, Map<Integer, String> variants,
560-
boolean stripGaps) throws MusialException {
625+
boolean excludeGaps) throws MusialException {
561626
// Validate input: an empty reference context is rejected.
562627
if (reference.isEmpty()) throw new IllegalArgumentException("Specified reference context is empty.");
563628

629+
if (excludeGaps) {
630+
return integrateVariantsWithoutGaps(reference, variants);
631+
} else {
632+
return integrateVariantsWithGaps(reference, variants);
633+
}
634+
}
635+
636+
/**
637+
* Integrates variants into a reference sequence while preserving gaps.
638+
* <p>
639+
* This method modifies the given reference sequence by incorporating the specified variants. Gaps are preserved in the resulting
640+
* sequence. The method handles deletions by tracking the set of deleted positions and ensures that the resulting sequence reflects the
641+
* integrated variants.
642+
*
643+
* @param reference A {@link NavigableMap} representing the reference sequence, where the key is the position and the value is a
644+
* {@link ReferenceContext} containing the character and extension at that position.
645+
* @param variants A {@link Map} containing the variants to integrate, where the key is the position and the value is the alternative
646+
* base sequence.
647+
* @return A {@link String} representing the reference sequence with the integrated variants, including gaps.
648+
* @throws MusialException If an invalid variant is encountered.
649+
*/
650+
private static String integrateVariantsWithGaps(NavigableMap<Integer, ReferenceContext> reference, Map<Integer, String> variants) throws MusialException {
564651
// Initialize result builder and deletions set.
565652
StringBuilder result = new StringBuilder(reference.size());
566653
Set<Integer> deletions = new HashSet<>();
@@ -605,8 +692,58 @@ public static String integrateVariants(NavigableMap<Integer, ReferenceContext> r
605692
}
606693
}
607694

608-
// Return the final sequence, optionally stripping gaps.
609-
return stripGaps ? stripGaps(result.toString()) : result.toString();
695+
return result.toString();
696+
}
697+
698+
/**
699+
* Integrates variants into a reference sequence while removing gaps.
700+
* <p>
701+
* This method modifies the given reference sequence by incorporating the specified variants. Gaps induced by deletions are not included
702+
* in the resulting sequence. The method handles deletions by tracking the set of positions to skip and ensures that the resulting
703+
* sequence reflects the integrated variants without gaps.
704+
*
705+
* @param reference A {@link NavigableMap} representing the reference sequence, where the key is the position and the value is a
706+
* {@link ReferenceContext} containing the character at that position.
707+
* @param variants A {@link Map} containing the variants to integrate, where the key is the position and the value is the alternative
708+
* base sequence.
709+
* @return A {@link String} representing the reference sequence with the integrated variants, excluding gaps.
710+
* @throws MusialException If an invalid variant is encountered.
711+
*/
712+
private static String integrateVariantsWithoutGaps(NavigableMap<Integer, ReferenceContext> reference, Map<Integer, String> variants) throws MusialException {
713+
// Initialize result builder and the set of deleted positions that still have to be skipped.
714+
StringBuilder result = new StringBuilder(reference.size());
715+
Set<Integer> deletions = new HashSet<>();
716+
717+
// Process each entry of the reference map in its iteration order.
718+
for (var entry : reference.entrySet()) {
719+
int position = entry.getKey();
720+
char referenceCharacter = entry.getValue().character();
721+
722+
// Check if a variant exists at the current position.
723+
if (variants.containsKey(position)) {
724+
String variant = variants.get(position);
725+
// A position marked as deleted is skipped: it is removed from the set and no gap is emitted.
726+
if (!deletions.remove(position)) {
727+
// Otherwise, append the first character of the variant.
728+
result.append(variant.charAt(0));
729+
}
730+
// Evaluated even if the position itself was skipped: mark the following positions as deleted, append inserted bases, or reject a variant that is neither deletion, insertion nor substitution.
731+
if (isDeletion(variant)) {
732+
deletions.addAll(IntStream.rangeClosed(position + 1, position + (variant.length() - 1)).boxed().toList());
733+
} else if (isInsertion(variant)) {
734+
result.append(variant.substring(1));
735+
} else if (!isSubstitution(variant)) {
736+
throw new MusialException("Invalid variant %s at position %d.".formatted(variant, position));
737+
}
738+
} else {
739+
// Skip the position if it is marked as deleted; otherwise append the current character from the reference sequence.
740+
if (!deletions.remove(position)) {
741+
result.append(referenceCharacter);
742+
}
743+
}
744+
}
745+
746+
return result.toString();
610747
}
611748

612749
/**

0 commit comments

Comments
 (0)