Skip to content

Commit c343634

Browse files
committed
Add isolated_as_missing to ts.alignments
1 parent e66da01 commit c343634

File tree

3 files changed

+36
-3
lines changed

3 files changed

+36
-3
lines changed

python/CHANGELOG.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
----------------------
2-
[1.0.0b1] - 2025-09-24
2+
[1.0.0b2] - 2025-XX-XX
33
----------------------
44

55
**Breaking Changes**
@@ -21,6 +21,10 @@
2121
- ``draw_svg()`` methods now associate tree branches with edge IDs
2222
(:user:`hyanwong`, :pr:`3193`, :issue:`557`)
2323

24+
- ``TreeSequence.alignments`` now accepts ``isolated_as_missing=False`` so that
25+
alignments can be emitted for non-sample nodes (e.g., internal ARG nodes) when
26+
missing data are imputed to the ancestral state. (:user:`benjeffery`, :issue:`3293`)
27+
2428
- ``draw_svg()`` methods now allow the y-axis to be placed on the right-hand side
2529
using ``y_axis="right"`` (:user:`hyanwong`, :pr:`3201`)
2630

python/tests/test_genotypes.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,6 +1286,22 @@ def test_non_sample_samples(self):
12861286
with pytest.raises(tskit.LibraryError, match="MUST_IMPUTE_NON_SAMPLES"):
12871287
list(ts.alignments(samples=[4]))
12881288

1289+
def test_internal_nodes_with_imputation(self):
1290+
ts = self.ts()
1291+
internal = [u for u in range(ts.num_nodes) if u not in set(ts.samples())]
1292+
seqs = list(ts.alignments(samples=internal, isolated_as_missing=False))
1293+
assert seqs == ["NNANNNNNNC", "NNANNNNNNT"]
1294+
1295+
def test_internal_and_sample_nodes_with_imputation(self):
1296+
ts = self.ts()
1297+
samples = ts.samples()
1298+
internal = next(u for u in range(ts.num_nodes) if u not in set(samples))
1299+
seqs = list(
1300+
ts.alignments(samples=[internal, samples[0]], isolated_as_missing=False)
1301+
)
1302+
assert seqs[0] == "NNANNNNNNC"
1303+
assert seqs[1] == "NNGNNNNNNT"
1304+
12891305
def test_alignments_missing_data_char(self):
12901306
A = list(self.ts().alignments(missing_data_character="x"))
12911307
assert A[0] == "xxGxxxxxxT"

python/tskit/trees.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5596,6 +5596,7 @@ def alignments(
55965596
samples=None,
55975597
left=None,
55985598
right=None,
5599+
isolated_as_missing=None,
55995600
):
56005601
"""
56015602
Returns an iterator over the full sequence alignments for the defined samples
@@ -5660,7 +5661,9 @@ def alignments(
56605661
currently supported by this method and it will raise a ValueError
56615662
if called on tree sequences containing isolated samples.
56625663
See https://github.com/tskit-dev/tskit/issues/1896 for more
5663-
information.
5664+
information. If you wish to include non-sample nodes (e.g., internal
5665+
ARG nodes) in the output, set ``isolated_as_missing=False`` to opt out
5666+
of this guard and impute missing data as the ancestral state.
56645667
56655668
See also the :meth:`.variants` iterator for site-centric access
56665669
to sample genotypes and :meth:`.haplotypes` for access to sample sequences
@@ -5679,6 +5682,11 @@ def alignments(
56795682
(default) alignments start at 0.
56805683
:param int right: Alignments will stop before this genomic position. If ``None``
56815684
(default) alignments will continue until the end of the tree sequence.
5685+
:param bool isolated_as_missing: If True (default), isolated samples without
5686+
mutations are treated as missing data and this method raises an error
5687+
when any are detected. If False, missing data are imputed with the
5688+
ancestral state, which also permits alignments to be generated for
5689+
non-sample nodes such as internal ARG nodes.
56825690
:return: An iterator over the alignment strings for specified samples in
56835691
this tree sequence, in the order given in ``samples``.
56845692
:rtype: collections.abc.Iterable
@@ -5693,6 +5701,8 @@ def alignments(
56935701
missing_data_character = (
56945702
"N" if missing_data_character is None else missing_data_character
56955703
)
5704+
if isolated_as_missing is None:
5705+
isolated_as_missing = True
56965706

56975707
L = interval.span
56985708
a = np.empty(L, dtype=np.int8)
@@ -5730,7 +5740,9 @@ def alignments(
57305740
# incorrectly if have a sample isolated over the region (a, b],
57315741
# and if we have sites at each position from a to b, and at
57325742
# each site there is a mutation over the isolated sample.
5733-
if any(tree._has_isolated_samples() for tree in self.trees()):
5743+
if isolated_as_missing and any(
5744+
tree._has_isolated_samples() for tree in self.trees()
5745+
):
57345746
raise ValueError(
57355747
"Missing data not currently supported in alignments; see "
57365748
"https://github.com/tskit-dev/tskit/issues/1896 for details."
@@ -5741,6 +5753,7 @@ def alignments(
57415753
interval=interval,
57425754
missing_data_character=missing_data_character,
57435755
samples=samples,
5756+
isolated_as_missing=isolated_as_missing,
57445757
)
57455758
site_pos = self.sites_position.astype(np.int64)[
57465759
first_site_id : last_site_id + 1

0 commit comments

Comments
 (0)