Require the reference sequence length to always equal the sequence length

benjeffery · benjeffery · commit fe7724c3bbd4 · 2025-11-11T16:12:06.000Z
diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py
@@ -1697,7 +1697,7 @@ def test_alignments_left_right_subinterval(self):
         ts = self.ts()
         # Use a custom reference and a subinterval [2, 8)
         ref = "A" * 10
-        got = list(ts.alignments(reference_sequence=ref[2:8], left=2, right=8))
+        got = list(ts.alignments(reference_sequence=ref, left=2, right=8))
         assert got == ["GAAAAA", "AAAAAA", "AAAAAA", "NNNNNN"]
 
     def test_fasta_reference_sequence(self):
@@ -1851,17 +1851,19 @@ def build_ts(self):
     def test_whole_window_missing_at_ends(self):
         ts = self.build_ts()
         ref = "0123456789"
-        # Node is isolated outside [3,7): expect missing there; inside use ref, with site overlay at 5
+        # Node is isolated outside [3,7): expect missing there; inside use ref,
+        # with site overlay at 5
         got = list(ts.alignments(samples=[1], reference_sequence=ref))
         assert got == ["NNN34G6NNN"]
 
     def test_subwindow(self):
         ts = self.build_ts()
         ref = "0123456789"
         # Request [2,8): expect missing at 2 and 7, ref inside, site overlay at 5
-        got = list(ts.alignments(samples=[1], reference_sequence=ref[2:8], left=2, right=8))
+        got = list(ts.alignments(samples=[1], reference_sequence=ref, left=2, right=8))
         assert got == ["N34G6N"]
 
+
 class TestMultiRootExample:
     # 1.00┊  4   5  ┊
     #     ┊ ┏┻┓ ┏┻┓ ┊
@@ -2234,17 +2236,17 @@ def test_reference_length_mismatch(self, ref_length):
         tables = tskit.TableCollection(10)
         tables.reference_sequence.data = "A" * ref_length
         ts = tables.tree_sequence()
-        if ref_length <= tables.sequence_length:
-            with pytest.raises(ValueError, match="shorter than"):
-                list(ts.alignments())
-        else:
-            # Longer reference sequences are allowed
+        with pytest.raises(
+            ValueError, match="must be equal to the tree sequence length"
+        ):
             list(ts.alignments())
 
     @pytest.mark.parametrize("ref", ["", "xy"])
     def test_reference_sequence_length_mismatch(self, ref):
         ts = self.simplest_ts()
-        with pytest.raises(ValueError, match="shorter than"):
+        with pytest.raises(
+            ValueError, match="must be equal to the tree sequence length"
+        ):
             list(ts.alignments(reference_sequence=ref))
 
     @pytest.mark.parametrize("ref", ["À", "┃", "α"])
@@ -2308,7 +2310,9 @@ def test_bad_restricted(self):
         tables = tskit.TableCollection(10)
         tables.reference_sequence.data = "A" * 7
         ts = tables.tree_sequence()
-        with pytest.raises(ValueError, match="sequence ends before"):
+        with pytest.raises(
+            ValueError, match="must be equal to the tree sequence length"
+        ):
             list(ts.alignments(right=8))
 
     def test_no_samples_default(self):
@@ -2342,16 +2346,20 @@ def test_reference_sequence_too_short_with_interval(self):
         tables = tskit.TableCollection(10)
         tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
         ts = tables.tree_sequence()
-        with pytest.raises(ValueError, match="ends before the requested stop position"):
-            list(ts.alignments(reference_sequence="A" * 5, left=2, right=8))  # L=6
+        with pytest.raises(
+            ValueError, match="must be equal to the tree sequence length"
+        ):
+            list(ts.alignments(reference_sequence="A" * 5, left=2, right=8))
 
-    def test_reference_sequence_too_long_with_interval(self):
-        # Explicit ref longer than [left,right) span should also error
+    def test_reference_sequence_length_must_match_sequence(self):
+        # Explicit ref length must match full sequence length
         tables = tskit.TableCollection(10)
         tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
         ts = tables.tree_sequence()
-        with pytest.raises(ValueError, match="ends before the requested stop position"):
-            list(ts.alignments(reference_sequence="A" * 7, left=2, right=8))  # L=6
+        with pytest.raises(
+            ValueError, match="must be equal to the tree sequence length"
+        ):
+            list(ts.alignments(reference_sequence="A" * 7, left=2, right=8))
 
 
 class TestAlignmentExamples:
@@ -2412,14 +2420,9 @@ def _reference_alignments(
         else:
             reference_sequence = missing_data_character * L
     if len(reference_sequence) != L:
-        if interval.right == int(ts.sequence_length):
-            raise ValueError(
-                "The reference sequence is shorter than the tree sequence length"
-            )
-        else:
-            raise ValueError(
-                "The reference sequence ends before the requested stop position"
-            )
+        raise ValueError(
+            "The reference sequence must be equal to the tree sequence length"
+        )
     ref_array = np.frombuffer(reference_sequence.encode("ascii"), dtype=np.int8)
     H, (first_site_id, last_site_id) = ts._haplotypes_array(
         interval=interval,
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -5685,21 +5685,25 @@ def alignments(
            single byte characters, (i.e., variants must be single nucleotide
            polymorphisms, or SNPs).
 
-        Missing data handling:
-        - If ``isolated_as_missing=True`` (default), nodes that are isolated in a
-          tree (no parent and no children) are rendered as the missing character
-          across for all bases at that tree's interval except for sites with a
-          mutation directly above the node.
-        - If ``isolated_as_missing=False``, no missing-overlay is applied. At sites,
-          genotypes are decoded as usual; at non-sites, characters come from the
+        Missing data handling
+
+        - If ``isolated_as_missing=True`` (default), nodes that are isolated
+          (no parent and no children) are rendered as the missing character across
+          each tree interval. At site positions, the per-site allele overrides the
+          missing character; if a genotype is missing (``-1``), the missing
+          character is retained.
+        - If ``isolated_as_missing=False``, no missing overlay is applied. At sites,
+          genotypes are decoded as usual; at non-sites, bases come from the
           reference sequence.
 
         See also the :meth:`.variants` iterator for site-centric access
         to sample genotypes and :meth:`.haplotypes` for access to sample sequences
         at just the sites in the tree sequence.
 
         :param str reference_sequence: The reference sequence to fill in
-            gaps between sites in the alignments.
+            gaps between sites in the alignments. If provided, it must be a
+            string of length equal to :attr:`.sequence_length`; the sequence is
+            sliced internally to the requested ``[left, right)`` interval.
         :param str missing_data_character: A single ascii character that will
             be used to represent missing data.
             If any normal allele contains this character, an error is raised.
@@ -5735,32 +5739,29 @@ def alignments(
 
         L = interval.span
         a = np.empty(L, dtype=np.int8)
-        if reference_sequence is None:
-            if self.has_reference_sequence():
-                # This may be inefficient - see #1989. However, since we're
-                # n copies of the reference sequence anyway, this is a relatively
-                # minor tweak. We may also want to recode the below not to use direct
-                # access to the .data attribute, e.g. if we allow reference sequences
-                # to start at non-zero positions
-                reference_sequence = self.reference_sequence.data[
-                    interval.left : interval.right
-                ]
-            else:
-                reference_sequence = missing_data_character * L
-
-        if len(reference_sequence) != L:
-            if interval.right == int(self.sequence_length):
-                raise ValueError(
-                    "The reference sequence is shorter than the tree sequence length"
-                )
-            else:
+        full_ref = None
+        if reference_sequence is not None:
+            full_ref = reference_sequence
+        elif self.has_reference_sequence():
+            # This may be inefficient - see #1989. However, since we're
+            # making n copies of the reference sequence anyway, this is a relatively
+            # minor tweak. We may also want to recode the below not to use direct
+            # access to the .data attribute, e.g. if we allow reference sequences
+            # to start at non-zero positions
+            full_ref = self.reference_sequence.data
+
+        if full_ref is None:
+            ref_slice = missing_data_character * L
+        else:
+            if len(full_ref) != int(self.sequence_length):
                 raise ValueError(
-                    "The reference sequence ends before the requested stop position"
+                    "The reference sequence must be equal to the tree sequence length"
                 )
+            ref_slice = full_ref[interval.left : interval.right]
 
-        # TODO Replace this as simple-as-possible readable Python version with C
-        # Reusable reference buffer
-        ref_bytes = reference_sequence.encode("ascii")
+        # TODO Replace this readable Python version with a C backend
+        # Reusable reference buffer for this interval
+        ref_bytes = ref_slice.encode("ascii")
         ref_array = np.frombuffer(ref_bytes, dtype=np.int8)
 
         H, (first_site_id, last_site_id) = self._haplotypes_array(