@@ -474,30 +474,46 @@ public static String stripGaps(String s) {
474474 * Integrates variants into a reference sequence.
475475 * <p>
476476 * This method modifies the given reference sequence by incorporating the specified variants. Variants are represented as a
477- * {@link NavigableMap} where the key is the 0-based position in the reference sequence, and the value is the alternative base
478- * sequence.
479- * <p>
477+ * {@link NavigableMap} where the key is the 0-based position in the reference sequence, and the value is the alternative base sequence.
480478 * Variants have to be in canonical form, i.e. they must start with a non-gap character that is assumed to be in the coordinate system
481479 * of the reference sequence. If additional characters follow, these have to be all gaps (indicating a deletion) or all non-gaps
482480 * (indicating an insertion).
483481 * <p>
484- * The method handles deletions by tracking the number of gaps introduced and ensures that the resulting sequence reflects the
485- * integrated variants. Optionally, gaps can be stripped from the final sequence .
482+ * The method handles deletions by tracking the number of deleted downstream positions. These are either replaced by gap symbols or
483+ * ignored, depending on the value specified for {@code excludeGaps} .
486484 *
487- * @param reference The original reference sequence as a {@link String}.
488- * @param variants A {@link NavigableMap} containing the variants to integrate, where the key is the position and the value is the
489- * alternative base sequence in canonical form.
490- * @param stripGaps A {@code boolean} indicating whether to remove gaps from the resulting sequence.
485+ * @param reference The original reference sequence as a {@link String}.
486+ * @param variants A {@link NavigableMap} containing the variants to integrate, where the key is the position and the value is the
487+ * alternative base sequence in canonical form.
488+ * @param excludeGaps A {@code boolean} indicating whether to exclude gaps from the resulting sequence.
491489 * @return A {@link String} representing the reference sequence with the integrated variants. If {@code excludeGaps} is {@code true}, gaps
492490 * are removed from the resulting sequence.
493491 * @throws IllegalArgumentException If the reference sequence is empty.
494492 * @throws MusialException If an invalid variant is encountered.
495493 */
496- public static String integrateVariants (String reference , NavigableMap <Integer , String > variants , boolean stripGaps ) throws MusialException {
494+ public static String integrateVariants (String reference , NavigableMap <Integer , String > variants , boolean excludeGaps ) throws MusialException {
497495 // Validate input and return early if no processing is needed.
498496 if (reference .isEmpty ()) throw new IllegalArgumentException ("Specified reference sequence is empty." );
499497 if (variants .isEmpty ()) return reference ;
500498
499+ // Choose the appropriate integration method based on the excludeGaps flag.
500+ if (excludeGaps ) {
501+ return integrateVariantsWithoutGaps (reference , variants );
502+ } else {
503+ return integrateVariantsWithGaps (reference , variants );
504+ }
505+ }
506+
507+ /**
508+ * Integrates variants into a reference sequence {@link String} while preserving gaps.
509+ *
510+ * @param reference The original reference sequence as a {@link String}.
511+ * @param variants A {@link NavigableMap} containing the variants to integrate, where the key is the position and the value is the
512+ * alternative base sequence in canonical form.
513+ * @return A {@link String} representing the reference sequence with the integrated variants, including gaps.
514+ * @throws MusialException If an invalid variant is encountered.
515+ */
516+ private static String integrateVariantsWithGaps (String reference , NavigableMap <Integer , String > variants ) throws MusialException {
501517 // Initialize result builder and deletion counter.
502518 StringBuilder result = new StringBuilder (reference .length ());
503519 int deletions = 0 ;
@@ -530,37 +546,108 @@ public static String integrateVariants(String reference, NavigableMap<Integer, S
530546 }
531547 }
532548
533- // Return the final sequence, optionally stripping gaps.
534- return stripGaps ? stripGaps (result .toString ()) : result .toString ();
549+ return result .toString ();
550+ }
551+
552+ /**
553+ * Integrates variants into a reference sequence {@link String} while removing gaps.
554+ *
555+ * @param reference The original reference sequence as a {@link String}.
556+ * @param variants A {@link NavigableMap} containing the variants to integrate, where the key is the position and the value is the
557+ * alternative base sequence in canonical form.
558+ * @return A {@link String} representing the reference sequence with the integrated variants, excluding gaps.
559+ * @throws MusialException If an invalid variant is encountered.
560+ */
561+ private static String integrateVariantsWithoutGaps (String reference , NavigableMap <Integer , String > variants ) throws MusialException {
562+ // Initialize result builder and deletion counter.
563+ StringBuilder result = new StringBuilder (reference .length ());
564+ int deletions = 0 ;
565+
566+ // Process each position in the reference sequence.
567+ for (int i = 0 ; i < reference .length (); i ++) {
568+ // Check if a variant exists at the current position.
569+ if (variants .containsKey (i )) {
570+ String variant = variants .get (i );
571+ // Handle deletions by appending gaps if needed.
572+ if (deletions > 0 ) {
573+ deletions --;
574+ } else {
575+ // Append the first character of the variant.
576+ result .append (variant .charAt (0 ));
577+ }
578+ // Handle insertions, deletions, or invalid variants.
579+ if (isInsertion (variant )) {
580+ result .append (variant .substring (1 ));
581+ } else if (isDeletion (variant )) {
582+ deletions += variant .length () - 1 ;
583+ } else if (!isSubstitution (variant )) {
584+ throw new MusialException ("Invalid variant '%s' at position %d." .formatted (variant , i ));
585+ }
586+ } else {
587+ // Append the current character from the reference sequence if no deletions are pending.
588+ if (deletions > 0 ) {
589+ deletions --;
590+ } else {
591+ result .append (reference .charAt (i ));
592+ }
593+ }
594+ }
595+
596+ return result .toString ();
535597 }
536598
537599 /**
538600 * Integrates variants into a reference sequence.
539601 * <p>
540602 * This method modifies the given reference sequence by incorporating the specified variants. Variants are represented as a {@link Map}
541- * where the key is the position in the reference sequence, and the value is the alternative base sequence.
542- * <p>
543- * The method handles substitutions, insertions, and deletions: - Substitutions replace the reference character at the position. -
544- * Insertions add additional characters after the reference character. - Deletions remove characters from the reference sequence .
603+ * where the key is the 1-based position in the reference sequence, and the value is the alternative base sequence. Variants have to be
604+ * in canonical form, i.e. they must start with a non-gap character that is assumed to be in the coordinate system of the reference
605+ * sequence. If additional characters follow, these have to be all gaps (indicating a deletion) or all non-gaps (indicating an
606+ * insertion) .
545607 * <p>
546- * The method also accounts for extensions (e.g., gaps) and ensures that the resulting sequence reflects the integrated variants.
547- * Optionally, gaps can be stripped from the final sequence.
608+ * In contrast to {@link #integrateVariants(String, NavigableMap, boolean)}, the reference sequence is provided as a
609+ * {@link NavigableMap} where the key is the 1-based position and the value is a {@link ReferenceContext} containing the character and
610+ * the maximal number of inserted bases at the position wrt. a collection of biological samples. This allows to represent gaps that have
611+ * been introduced by insertions in other biological samples. These gaps as well as such induced by deletions can be either preserved or
612+ * removed from the resulting sequence, depending on the value specified for {@code excludeGaps}.
548613 *
549- * @param reference A {@link NavigableMap} representing the reference sequence, where the key is the position and the value is a
550- * {@link ReferenceContext} containing the character and extension at that position.
551- * @param variants A {@link Map} containing the variants to integrate, where the key is the position and the value is the alternative
552- * base sequence.
553- * @param stripGaps A {@code boolean} indicating whether to remove gaps from the resulting sequence.
614+ * @param reference A {@link NavigableMap} representing the reference sequence, where the key is the position and the value is a
615+ * {@link ReferenceContext} containing the character and extension at that position.
616+ * @param variants A {@link Map} containing the variants to integrate, where the key is the position and the value is the alternative
617+ * base sequence.
618+ * @param excludeGaps A {@code boolean} indicating whether to remove gaps from the resulting sequence.
554619 * @return A {@link String} representing the reference sequence with the integrated variants. If {@code stripGaps} is {@code true}, gaps
555620 * are removed from the resulting sequence.
556621 * @throws IllegalArgumentException If the reference sequence is empty or if an invalid variant is encountered.
557622 * @throws MusialException If an invalid variant is encountered.
558623 */
559624 public static String integrateVariants (NavigableMap <Integer , ReferenceContext > reference , Map <Integer , String > variants ,
560- boolean stripGaps ) throws MusialException {
625+ boolean excludeGaps ) throws MusialException {
561626 // Validate input.
562627 if (reference .isEmpty ()) throw new IllegalArgumentException ("Specified reference context is empty." );
563628
629+ if (excludeGaps ) {
630+ return integrateVariantsWithoutGaps (reference , variants );
631+ } else {
632+ return integrateVariantsWithGaps (reference , variants );
633+ }
634+ }
635+
636+ /**
637+ * Integrates variants into a reference sequence while preserving gaps.
638+ * <p>
639+ * This method modifies the given reference sequence by incorporating the specified variants. Gaps are preserved in the resulting
640+ * sequence. The method handles deletions by tracking the number of gaps introduced and ensures that the resulting sequence reflects the
641+ * integrated variants.
642+ *
643+ * @param reference A {@link NavigableMap} representing the reference sequence, where the key is the position and the value is a
644+ * {@link ReferenceContext} containing the character and extension at that position.
645+ * @param variants A {@link Map} containing the variants to integrate, where the key is the position and the value is the alternative
646+ * base sequence.
647+ * @return A {@link String} representing the reference sequence with the integrated variants, including gaps.
648+ * @throws MusialException If an invalid variant is encountered.
649+ */
650+ private static String integrateVariantsWithGaps (NavigableMap <Integer , ReferenceContext > reference , Map <Integer , String > variants ) throws MusialException {
564651 // Initialize result builder and deletions set.
565652 StringBuilder result = new StringBuilder (reference .size ());
566653 Set <Integer > deletions = new HashSet <>();
@@ -605,8 +692,58 @@ public static String integrateVariants(NavigableMap<Integer, ReferenceContext> r
605692 }
606693 }
607694
608- // Return the final sequence, optionally stripping gaps.
609- return stripGaps ? stripGaps (result .toString ()) : result .toString ();
695+ return result .toString ();
696+ }
697+
698+ /**
699+ * Integrates variants into a reference sequence while removing gaps.
700+ * <p>
701+ * This method modifies the given reference sequence by incorporating the specified variants. Gaps induced by deletions are not included
702+ * in the resulting sequence. The method handles deletions by tracking the number of positions to skip and ensures that the resulting
703+ * sequence reflects the integrated variants without gaps.
704+ *
705+ * @param reference A {@link NavigableMap} representing the reference sequence, where the key is the position and the value is a
706+ * {@link ReferenceContext} containing the character at that position.
707+ * @param variants A {@link Map} containing the variants to integrate, where the key is the position and the value is the alternative
708+ * base sequence.
709+ * @return A {@link String} representing the reference sequence with the integrated variants, excluding gaps.
710+ * @throws MusialException If an invalid variant is encountered.
711+ */
712+ private static String integrateVariantsWithoutGaps (NavigableMap <Integer , ReferenceContext > reference , Map <Integer , String > variants ) throws MusialException {
713+ // Initialize result builder and deletions set.
714+ StringBuilder result = new StringBuilder (reference .size ());
715+ Set <Integer > deletions = new HashSet <>();
716+
717+ // Process each position in the reference sequence.
718+ for (var entry : reference .entrySet ()) {
719+ int position = entry .getKey ();
720+ char referenceCharacter = entry .getValue ().character ();
721+
722+ // Check if a variant exists at the current position.
723+ if (variants .containsKey (position )) {
724+ String variant = variants .get (position );
725+ // Handle deletions by appending a gap, if needed.
726+ if (!deletions .remove (position )) {
727+ // Append the first character of the variant.
728+ result .append (variant .charAt (0 ));
729+ }
730+ // Add deleted positions.
731+ if (isDeletion (variant )) {
732+ deletions .addAll (IntStream .rangeClosed (position + 1 , position + (variant .length () - 1 )).boxed ().toList ());
733+ } else if (isInsertion (variant )) {
734+ result .append (variant .substring (1 ));
735+ } else if (!isSubstitution (variant )) {
736+ throw new MusialException ("Invalid variant %s at position %d." .formatted (variant , position ));
737+ }
738+ } else {
739+ // Append a gap or the current character from the reference sequence.
740+ if (!deletions .remove (position )) {
741+ result .append (referenceCharacter );
742+ }
743+ }
744+ }
745+
746+ return result .toString ();
610747 }
611748
612749 /**
0 commit comments