diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 8226208371..0cdd0576cd 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -1,3 +1,12 @@ +---------- +Unreleased +---------- + +- ``tsk_variant_init`` and associated variant decoding methods now + fully support TSK_ISOLATED_NOT_MISSING not being set for internal nodes. + (:user:`benjeffery`, :pr:`3313`) + + -------------------- [1.2.0] - 2025-09-24 -------------------- diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index fb15196ac8..d54d487c48 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -1,3 +1,20 @@ +---------- +Unreleased +---------- + +**Features** + +- ``TreeSequence.variants``, ``.genotype_matrix``, ``.haplotypes``, and ``.alignments`` methods + now fully support ``isolated_as_missing`` behaviour with internal nodes. + (:user:`benjeffery`, :pr:`3313`, :pr:`3317`, :issue:`1896`) + +**Breaking Changes** + +- The ``reference_sequence`` argument to ``TreeSequence.alignments`` is now required + to be the same length as the tree sequence. Previously it was required to be the length + of the requested interval. + (:user:`benjeffery`, :pr:`3317`) + ---------------------- [1.0.0b3] - 2025-10-15 ---------------------- diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index 175b24735f..672fbe2088 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -1283,8 +1283,18 @@ def test_empty_samples(self): def test_non_sample_samples(self): ts = self.ts() - with pytest.raises(tskit.LibraryError, match="MUST_IMPUTE_NON_SAMPLES"): - list(ts.alignments(samples=[4])) + assert list(ts.alignments(samples=[4])) == ["NNANNNNNNT"] + assert list(ts.alignments(samples=[3])) == ["NNANNNNNNC"] + + def test_alignments_samples_order_preserved(self): + ts = self.ts() + # Custom non-default, unique order + rows = list(ts.alignments(samples=[2, 0, 1])) + assert rows == [ + "NNANNNNNNC", # sample 2 + "NNGNNNNNNT", # sample 0 + "NNANNNNNNC", # sample 1 + ] def test_variants_internal_nodes(self): ts = self.ts() @@ -1501,6 +1511,15 @@ def test_alignments_missing_data_char(self): assert A[1] == "ACATACGTAC" assert A[2] == "ACATACGTAC" + def test_alignments_restricted_embedded_reference(self): + ts = self.ts() + # Use embedded reference ("ACGTACGTAC"). Slice [1,9) -> "CGTACGTA". + A = list(ts.alignments(left=1, right=9)) + # Site at pos 2 overlays: sample 0 gets 'G' (derived), others 'A' (ancestral). + assert A[0] == "CGTACGTA" + assert A[1] == "CATACGTA" + assert A[2] == "CATACGTA" + def test_alignments_reference_sequence(self): ref = "0123456789" A = list(self.ts().alignments(reference_sequence=ref)) @@ -1625,7 +1644,6 @@ def test_genotypes(self): Gp = [[1, 0, 0, -1], [0, 1, 1, -1]] np.testing.assert_array_equal(G, Gp) - @pytest.mark.skip("Missing data in alignments: #1896") def test_alignments_default(self): A = list(self.ts().alignments()) assert A[0] == "NNGNNNNNNT" @@ -1633,12 +1651,6 @@ def test_alignments_default(self): assert A[2] == "NNANNNNNNC" assert A[3] == "NNNNNNNNNN" - def test_alignments_fails(self): - # https://github.com/tskit-dev/tskit/issues/1896 - with pytest.raises(ValueError, match="1896"): - next(self.ts().alignments()) - - @pytest.mark.skip("Missing data in alignments: #1896") def test_alignments_impute_missing(self): ref = "N" * 10 A = list( @@ -1649,7 +1661,6 @@ def test_alignments_impute_missing(self): assert A[2] == "NNANNNNNNC" assert A[3] == "NNANNNNNNT" - @pytest.mark.skip("Missing data in alignments: #1896") def test_alignments_missing_char(self): A = list(self.ts().alignments(missing_data_character="z")) assert A[0] == "zzGzzzzzzT" @@ -1657,15 +1668,13 @@ def test_alignments_missing_char(self): assert A[2] == "zzAzzzzzzC" assert A[3] == "zzzzzzzzzz" - @pytest.mark.skip("Missing data in alignments: #1896") def test_alignments_missing_char_ref(self): - A = list(self.ts().alignments(missing_data_character="z")) + A = list(self.ts().alignments()) assert A[0] == "NNGNNNNNNT" assert A[1] == "NNANNNNNNC" assert A[2] == "NNANNNNNNC" - assert A[3] == "zzzzzzzzzz" + assert A[3] == "NNNNNNNNNN" - @pytest.mark.skip("Missing data in alignments: #1896") def test_alignments_reference_sequence(self): ref = "0123456789" A = list(self.ts().alignments(reference_sequence=ref)) @@ -1674,7 +1683,6 @@ def test_alignments_reference_sequence(self): assert A[2] == "01A345678C" assert A[3] == "NNNNNNNNNN" - @pytest.mark.skip("Missing data in alignments: #1896") def test_alignments_reference_sequence_missing_data_char(self): ref = "0123456789" A = list( @@ -1685,7 +1693,13 @@ def test_alignments_reference_sequence_missing_data_char(self): assert A[2] == "01A345678C" assert A[3] == "QQQQQQQQQQ" - @pytest.mark.skip("Missing data in alignments: #1896") + def test_alignments_left_right_subinterval(self): + ts = self.ts() + # Use a custom reference and a subinterval [2, 8) + ref = "A" * 10 + got = list(ts.alignments(reference_sequence=ref, left=2, right=8)) + assert got == ["GAAAAA", "AAAAAA", "AAAAAA", "NNNNNN"] + def test_fasta_reference_sequence(self): ref = "0123456789" expected = textwrap.dedent( @@ -1702,7 +1716,6 @@ def test_fasta_reference_sequence(self): ) assert expected == self.ts().as_fasta(reference_sequence=ref) - @pytest.mark.skip("Missing data in alignments: #1896") def test_fasta_reference_sequence_missing_data_char(self): ref = "0123456789" expected = textwrap.dedent( @@ -1721,7 +1734,6 @@ def test_fasta_reference_sequence_missing_data_char(self): reference_sequence=ref, missing_data_character="Q" ) - @pytest.mark.skip("Missing data in alignments: #1896") def test_fasta_impute_missing(self): ref = "N" * 10 expected = textwrap.dedent( @@ -1743,7 +1755,6 @@ def test_fasta_impute_missing(self): # Note: the nexus tree output isn't compatible with our representation of # missing data as trees with isolated roots (newick parsers won't accept # this as valid input), so we set include_trees=False for these examples. - @pytest.mark.skip("Missing data in alignments: #1896") def test_nexus_reference_sequence(self): ref = "0123456789" expected = textwrap.dedent( @@ -1769,7 +1780,6 @@ def test_nexus_reference_sequence(self): reference_sequence=ref, include_trees=False ) - @pytest.mark.skip("Missing data in alignments: #1896") def test_nexus_reference_sequence_missing_data_char(self): ref = "0123456789" expected = textwrap.dedent( @@ -1797,7 +1807,6 @@ def test_nexus_reference_sequence_missing_data_char(self): include_trees=False, ) - @pytest.mark.skip("Missing data in alignments: #1896") def test_nexus_impute_missing(self): ref = "0123456789" expected = textwrap.dedent( @@ -1826,6 +1835,35 @@ def test_nexus_impute_missing(self): ) +class TestAlignmentsPartialIsolation: + def build_ts(self): + # sequence length 10, sample node covers only [3,7) + tables = tskit.TableCollection(10) + parent = tables.nodes.add_row(time=1) + child = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + tables.edges.add_row(left=3, right=7, parent=parent, child=child) + # Add a site inside the covered region with a mutation on the child + site_id = tables.sites.add_row(position=5, ancestral_state="A") + tables.mutations.add_row(site=site_id, node=child, derived_state="G") + tables.sort() + return tables.tree_sequence() + + def test_whole_window_missing_at_ends(self): + ts = self.build_ts() + ref = "0123456789" + # Node is isolated outside [3,7): expect missing there; inside use ref, + # with site overlay at 5 + got = list(ts.alignments(samples=[1], reference_sequence=ref)) + assert got == ["NNN34G6NNN"] + + def test_subwindow(self): + ts = self.build_ts() + ref = "0123456789" + # Request [2,8): expect missing at 2 and 7, ref inside, site overlay at 5 + got = list(ts.alignments(samples=[1], reference_sequence=ref, left=2, right=8)) + assert got == ["N34G6N"] + + class TestMultiRootExample: # 1.00┊ 4 5 ┊ # ┊ ┏┻┓ ┏┻┓ ┊ @@ -1875,6 +1913,16 @@ def test_alignments_N_ref(self): assert A[2] == "NNGNNNNNAN" assert A[3] == "NNGNNNNNAN" + def test_alignments_multichar_allele_raises(self): + tables = self.ts().dump_tables() + tables.sites.clear() + tables.mutations.clear() + tables.sites.add_row(2, ancestral_state="AC") + tables.sort() + ts_bad = tables.tree_sequence() + with pytest.raises(TypeError): + list(ts_bad.alignments()) + def test_fasta_reference_sequence(self): ref = "0123456789" expected = textwrap.dedent( @@ -2188,17 +2236,17 @@ def test_reference_length_mismatch(self, ref_length): tables = tskit.TableCollection(10) tables.reference_sequence.data = "A" * ref_length ts = tables.tree_sequence() - if ref_length <= tables.sequence_length: - with pytest.raises(ValueError, match="shorter than"): - list(ts.alignments()) - else: - # Longer reference sequences are allowed + with pytest.raises( + ValueError, match="must be equal to the tree sequence length" + ): list(ts.alignments()) @pytest.mark.parametrize("ref", ["", "xy"]) def test_reference_sequence_length_mismatch(self, ref): ts = self.simplest_ts() - with pytest.raises(ValueError, match="shorter than"): + with pytest.raises( + ValueError, match="must be equal to the tree sequence length" + ): list(ts.alignments(reference_sequence=ref)) @pytest.mark.parametrize("ref", ["À", "┃", "α"]) @@ -2222,6 +2270,32 @@ def test_non_ascii_missing_data_char(self, missing_data_char): with pytest.raises(UnicodeEncodeError): list(ts.alignments(missing_data_character=missing_data_char)) + def test_multichar_missing_data_char(self): + ts = self.simplest_ts() + # Multi-character missing symbol is invalid + with pytest.raises(TypeError): + list(ts.alignments(reference_sequence="A", missing_data_character="NN")) + + def test_missing_char_clashes_with_allele(self): + # If the missing character equals an allele present at a site, error + tables = tskit.TableCollection(3) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + tables.sites.add_row(1, ancestral_state="A") + ts = tables.tree_sequence() + with pytest.raises(ValueError, match="clashes with an existing allele"): + list(ts.alignments(missing_data_character="A")) + + def test_invalid_negative_node(self): + ts = self.simplest_ts() + with pytest.raises(tskit.LibraryError, match="out of bounds"): + list(ts.alignments(samples=[-1])) + + def test_invalid_out_of_bounds_node(self): + ts = self.simplest_ts() + with pytest.raises(tskit.LibraryError, match="out of bounds"): + list(ts.alignments(samples=[ts.num_nodes])) + def test_bad_left(self): ts = tskit.TableCollection(10).tree_sequence() with pytest.raises(ValueError, match="integer"): @@ -2236,48 +2310,183 @@ def test_bad_restricted(self): tables = tskit.TableCollection(10) tables.reference_sequence.data = "A" * 7 ts = tables.tree_sequence() - with pytest.raises(ValueError, match="sequence ends before"): + with pytest.raises( + ValueError, match="must be equal to the tree sequence length" + ): list(ts.alignments(right=8)) + def test_no_samples_default(self): + # No sample nodes: default alignments iterator is empty + tables = tskit.TableCollection(5) + # Add a non-sample node only + tables.nodes.add_row(flags=0, time=0) + ts = tables.tree_sequence() + assert list(ts.alignments()) == [] + + def test_boundary_sites_left_and_right(self): + # Sites at the boundaries 0 and L-1 overlay correctly + L = 5 + tables = tskit.TableCollection(L) + a = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + b = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + p = tables.nodes.add_row(time=1) + tables.edges.add_row(0, L, parent=p, child=a) + s0 = tables.sites.add_row(0, ancestral_state="A") + s4 = tables.sites.add_row(L - 1, ancestral_state="T") + # Mutate at position 0 for node a; position L-1 for node b + tables.mutations.add_row(site=s0, node=a, derived_state="G") + tables.mutations.add_row(site=s4, node=b, derived_state="C") + ts = tables.tree_sequence() + A = list(ts.alignments(reference_sequence="N" * L)) + assert A[0] == "GNNNT" + assert A[1] == "NNNNC" + + def test_reference_sequence_too_short_with_interval(self): + # Explicit ref shorter than [left,right) span should error + tables = tskit.TableCollection(10) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + with pytest.raises( + ValueError, match="must be equal to the tree sequence length" + ): + list(ts.alignments(reference_sequence="A" * 5, left=2, right=8)) + + def test_reference_sequence_length_must_match_sequence(self): + # Explicit ref length must match full sequence length + tables = tskit.TableCollection(10) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + with pytest.raises( + ValueError, match="must be equal to the tree sequence length" + ): + list(ts.alignments(reference_sequence="A" * 7, left=2, right=8)) + class TestAlignmentExamples: @pytest.mark.parametrize("ts", get_example_discrete_genome_tree_sequences()) def test_defaults(self, ts): - has_missing_data = np.any(ts.genotype_matrix() == -1) - if has_missing_data: - with pytest.raises(ValueError, match="1896"): - list(ts.alignments()) - else: - A = list(ts.alignments()) - assert len(A) == ts.num_samples - H = list(ts.haplotypes()) - pos = ts.tables.sites.position.astype(int) - for a, h in map(np.array, zip(A, H)): - last = 0 - for j, x in enumerate(pos): - assert a[last:x] == "N" * (x - last) - assert a[x] == h[j] - last = x + 1 + A = list(ts.alignments()) + assert len(A) == ts.num_samples + H = list(ts.haplotypes()) + pos = ts.tables.sites.position.astype(int) + for a, h in map(np.array, zip(A, H)): + last = 0 + for j, x in enumerate(pos): + assert a[last:x] == "N" * (x - last) + assert a[x] == h[j] + last = x + 1 @pytest.mark.parametrize("ts", get_example_discrete_genome_tree_sequences()) def test_reference_sequence(self, ts): ref = tskit.random_nucleotides(ts.sequence_length, seed=1234) - has_missing_data = np.any(ts.genotype_matrix() == -1) - if has_missing_data: - with pytest.raises(ValueError, match="1896"): - list(ts.alignments(reference_sequence=ref)) + A = list(ts.alignments(reference_sequence=ref, isolated_as_missing=False)) + assert len(A) == ts.num_samples + H = list(ts.haplotypes(isolated_as_missing=False)) + pos = ts.tables.sites.position.astype(int) + for a, h in map(np.array, zip(A, H)): + last = 0 + for j, x in enumerate(pos): + assert a[last:x] == ref[last:x] + assert a[x] == h[j] + last = x + 1 + assert a[last:] == ref[last:] + + +# Reference implementation for alignments (tests only) +def _reference_alignments( + ts, + *, + reference_sequence=None, + missing_data_character=None, + isolated_as_missing=None, + samples=None, + left=None, + right=None, +): + if not ts.discrete_genome: + raise ValueError("sequence alignments only defined for discrete genomes") + interval = ts._check_genomic_range(left, right, ensure_integer=True) + missing_data_character = ( + "N" if missing_data_character is None else missing_data_character + ) + if isolated_as_missing is None: + isolated_as_missing = True + L = interval.span + if reference_sequence is None: + if ts.has_reference_sequence(): + reference_sequence = ts.reference_sequence.data[ + interval.left : interval.right + ] + else: + reference_sequence = missing_data_character * L + if len(reference_sequence) != L: + raise ValueError( + "The reference sequence must be equal to the tree sequence length" + ) + ref_array = np.frombuffer(reference_sequence.encode("ascii"), dtype=np.int8) + H, (first_site_id, last_site_id) = ts._haplotypes_array( + interval=interval, + isolated_as_missing=isolated_as_missing, + missing_data_character=missing_data_character, + samples=samples, + ) + site_pos = ts.sites_position.astype(np.int64)[first_site_id : last_site_id + 1] + sample_ids = ts.samples() if samples is None else list(samples) + missing_val = ord(missing_data_character) + a = np.empty(L, dtype=np.int8) + for i, u in enumerate(sample_ids): + a[:] = ref_array + if isolated_as_missing: + for t in ts.trees(): + li = max(interval.left, int(t.interval.left)) + ri = min(interval.right, int(t.interval.right)) + if ri > li and t.is_isolated(u): + a[li - interval.left : ri - interval.left] = missing_val + if H.shape[1] > 0: + a[site_pos - interval.left] = H[i] + yield a.tobytes().decode("ascii") + + +class TestAlignmentsReferenceImpl: + @pytest.mark.parametrize( + "case", + [ + "default", + "isolated_false_with_ref", + "interval", + ], + ) + @pytest.mark.parametrize("ts", get_example_tree_sequences()) + def test_against_python_reference(self, ts, case): + kwargs = {} + L = int(ts.sequence_length) + if case == "isolated_false_with_ref": + kwargs = {"reference_sequence": "A" * L, "isolated_as_missing": False} + elif case == "interval": + L = int(ts.sequence_length) + if L <= 1: + left, right = 0, L + else: + left, right = 1, max(2, L - 1) + kwargs = {"left": left, "right": right} + try: + got = list(ts.alignments(**kwargs)) + except Exception as e1: + ex1 = e1 + got = None + else: + ex1 = None + try: + exp = list(_reference_alignments(ts, **kwargs)) + except Exception as e2: + ex2 = e2 + exp = None + else: + ex2 = None + if ex1 or ex2: + assert type(ex1) is type(ex2) else: - A = list(ts.alignments(reference_sequence=ref)) - assert len(A) == ts.num_samples - H = list(ts.haplotypes()) - pos = ts.tables.sites.position.astype(int) - for a, h in map(np.array, zip(A, H)): - last = 0 - for j, x in enumerate(pos): - assert a[last:x] == ref[last:x] - assert a[x] == h[j] - last = x + 1 - assert a[last:] == ref[last:] + assert got == exp # diff --git a/python/tests/test_phylo_formats.py b/python/tests/test_phylo_formats.py index 5a24dce0b7..f5bb2ae993 100644 --- a/python/tests/test_phylo_formats.py +++ b/python/tests/test_phylo_formats.py @@ -1316,7 +1316,6 @@ def test_A_reference(self): ts = alignment_example(20) self.verify(ts, reference_sequence="A" * 20) - @pytest.mark.skip("Missing data in alignments: #1896") def test_missing_data(self): self.verify(missing_data_example()) @@ -1344,7 +1343,6 @@ def test_no_reference(self): alignment_map = self.parse(text) assert get_alignment_map(ts) == alignment_map - @pytest.mark.skip("Missing data in alignments: #1896") def test_missing_data(self): ts = missing_data_example() text = ts.as_fasta() @@ -1422,7 +1420,6 @@ def test_nexus_missing_N(self): assert str(d["n0"][0]) == "N" -@pytest.mark.skip("Missing data in alignments: #1896") class TestDendropyMissingData: """ Test that we detect missing data correctly in dendropy under @@ -1462,10 +1459,9 @@ def assert_missing_data_encoded_A_ref(self, d): # Do we detect that we have an ambiguous state for the missing sample? a5 = d["n5"] - # a5 is missing along the full length of the genome, so all sites are - # missing. + # a5 is missing along the full length of the genome, so all sites are missing. assert all( - a5.state_denomination == dendropy.StateAlphabet.AMBIGUOUS_STATE + a5[j].state_denomination == dendropy.StateAlphabet.AMBIGUOUS_STATE for j in range(10) ) diff --git a/python/tskit/text_formats.py b/python/tskit/text_formats.py index 52a42483ec..e44668d6ad 100644 --- a/python/tskit/text_formats.py +++ b/python/tskit/text_formats.py @@ -119,6 +119,7 @@ def write_nexus( include_alignments, reference_sequence, missing_data_character, + isolated_as_missing=None, ): # See TreeSequence.write_nexus for documentation on parameters. if precision is None: @@ -154,6 +155,7 @@ def write_nexus( alignments = ts.alignments( reference_sequence=reference_sequence, missing_data_character=missing_data_character, + isolated_as_missing=isolated_as_missing, ) for u, alignment in zip(ts.samples(), alignments): print(2 * indent, f"n{u}", " ", alignment, sep="", file=out) @@ -196,6 +198,7 @@ def write_fasta( wrap_width, reference_sequence, missing_data_character, + isolated_as_missing=None, ): # See TreeSequence.write_fasta for documentation if wrap_width < 0 or int(wrap_width) != wrap_width: @@ -208,6 +211,7 @@ def write_fasta( alignments = ts.alignments( reference_sequence=reference_sequence, missing_data_character=missing_data_character, + isolated_as_missing=isolated_as_missing, ) for u, alignment in zip(ts.samples(), alignments): print(">", f"n{u}", sep="", file=output) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 760f7055f6..9d146a6e05 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5620,6 +5620,7 @@ def alignments( *, reference_sequence=None, missing_data_character=None, + isolated_as_missing=None, samples=None, left=None, right=None, @@ -5684,26 +5685,36 @@ def alignments( single byte characters, (i.e., variants must be single nucleotide polymorphisms, or SNPs). - .. warning:: :ref:`Missing data` is not - currently supported by this method and it will raise a ValueError - if called on tree sequences containing isolated samples. - See https://github.com/tskit-dev/tskit/issues/1896 for more - information. + Missing data handling + + - If ``isolated_as_missing=True`` (default), nodes that are isolated + (no parent and no children) are rendered as the missing character across + each tree interval. At site positions, the per-site allele overrides the + missing character; if a genotype is missing (``-1``), the missing + character is retained. + - If ``isolated_as_missing=False``, no missing overlay is applied. At sites, + genotypes are decoded as usual; at non-sites, bases come from the + reference sequence. See also the :meth:`.variants` iterator for site-centric access to sample genotypes and :meth:`.haplotypes` for access to sample sequences at just the sites in the tree sequence. :param str reference_sequence: The reference sequence to fill in - gaps between sites in the alignments. + gaps between sites in the alignments. If provided, it must be a + string of length equal to :attr:`.sequence_length`; the sequence is + sliced internally to the requested ``[left, right)`` interval. :param str missing_data_character: A single ascii character that will be used to represent missing data. If any normal allele contains this character, an error is raised. Default: 'N'. - :param list[int] samples: The samples for which to output alignments. If - ``None`` (default), return alignments for all the samples in the tree - sequence, in the order given by the :meth:`.samples` method. Only - sample nodes are supported for alignments. + :param bool isolated_as_missing: If True, treat isolated nodes as missing + across the covered tree intervals (see above). If None (default), this + is treated as True. + :param list[int] samples: The nodes for which to output alignments. If + ``None`` (default), return alignments for all sample nodes in the order + given by the :meth:`.samples` method. Non-sample nodes are also supported + and will be decoded at sites in the same way as samples. :param int left: Alignments will start at this genomic position. If ``None`` (default) alignments start at 0. :param int right: Alignments will stop before this genomic position. If ``None`` @@ -5723,73 +5734,61 @@ def alignments( "N" if missing_data_character is None else missing_data_character ) - # Temporary check until alignments support missing data properly - if samples is not None: - requested = list(samples) - n = self.num_nodes - flags = self.nodes_flags - if any( - (u < 0) or (u >= n) or ((flags[u] & NODE_IS_SAMPLE) == 0) - for u in requested - ): - raise tskit.LibraryError( - "Cannot generate genotypes for non-samples when isolated nodes " - "are considered as missing. (TSK_ERR_MUST_IMPUTE_NON_SAMPLES)" - ) + if isolated_as_missing is None: + isolated_as_missing = True L = interval.span a = np.empty(L, dtype=np.int8) - if reference_sequence is None: - if self.has_reference_sequence(): - # This may be inefficient - see #1989. However, since we're - # n copies of the reference sequence anyway, this is a relatively - # minor tweak. We may also want to recode the below not to use direct - # access to the .data attribute, e.g. if we allow reference sequences - # to start at non-zero positions - reference_sequence = self.reference_sequence.data[ - interval.left : interval.right - ] - else: - reference_sequence = missing_data_character * L - - if len(reference_sequence) != L: - if interval.right == int(self.sequence_length): - raise ValueError( - "The reference sequence is shorter than the tree sequence length" - ) - else: + full_ref = None + if reference_sequence is not None: + full_ref = reference_sequence + elif self.has_reference_sequence(): + # This may be inefficient - see #1989. However, since we're + # n copies of the reference sequence anyway, this is a relatively + # minor tweak. We may also want to recode the below not to use direct + # access to the .data attribute, e.g. if we allow reference sequences + # to start at non-zero positions + full_ref = self.reference_sequence.data + + if full_ref is None: + ref_slice = missing_data_character * L + else: + if len(full_ref) != int(self.sequence_length): raise ValueError( - "The reference sequence ends before the requested stop position" + "The reference sequence must be equal to the tree sequence length" ) - ref_bytes = reference_sequence.encode("ascii") - a[:] = np.frombuffer(ref_bytes, dtype=np.int8) - - # To do this properly we'll have to detect the missing data as - # part of a full implementation of alignments in C. The current - # definition might not be calling some degenerate cases correctly; - # see https://github.com/tskit-dev/tskit/issues/1908 - # - # Note also that this will call the presence of missing data - # incorrectly if have a sample isolated over the region (a, b], - # and if we have sites at each position from a to b, and at - # each site there is a mutation over the isolated sample. - if any(tree._has_isolated_samples() for tree in self.trees()): - raise ValueError( - "Missing data not currently supported in alignments; see " - "https://github.com/tskit-dev/tskit/issues/1896 for details." - "The current implementation may also incorrectly identify an " - "input tree sequence has having missing data." - ) + ref_slice = full_ref[interval.left : interval.right] + + # TODO Replace this readable Python version with a C backend + # Reusable reference buffer for this interval + ref_bytes = ref_slice.encode("ascii") + ref_array = np.frombuffer(ref_bytes, dtype=np.int8) + H, (first_site_id, last_site_id) = self._haplotypes_array( interval=interval, + isolated_as_missing=isolated_as_missing, missing_data_character=missing_data_character, samples=samples, ) site_pos = self.sites_position.astype(np.int64)[ first_site_id : last_site_id + 1 ] - for h in H: - a[site_pos - interval.left] = h + # Determine the requested node order + sample_ids = self.samples() if samples is None else list(samples) + missing_val = ord(missing_data_character) + for i, u in enumerate(sample_ids): + # Reset to the reference for this row + a[:] = ref_array + if isolated_as_missing: + # Mark isolated intervals as missing for this node + for t in self.trees(): + li = max(interval.left, int(t.interval.left)) + ri = min(interval.right, int(t.interval.right)) + if ri > li and t.is_isolated(u): + a[li - interval.left : ri - interval.left] = missing_val + # Overlay site alleles for this node + if H.shape[1] > 0: + a[site_pos - interval.left] = H[i] yield a.tobytes().decode("ascii") @property @@ -6706,6 +6705,7 @@ def write_fasta( wrap_width=60, reference_sequence=None, missing_data_character=None, + isolated_as_missing=None, ): """ Writes the :meth:`.alignments` for this tree sequence to file in @@ -6730,12 +6730,6 @@ def write_fasta( ts.write_fasta("output.fa") - .. warning:: :ref:`Missing data` is not - currently supported by this method and it will raise a ValueError - if called on tree sequences containing isolated samples. - See https://github.com/tskit-dev/tskit/issues/1896 for more - information. - :param file_or_path: The file object or path to write the output. Paths can be either strings or :class:`python:pathlib.Path` objects. :param int wrap_width: The number of sequence @@ -6744,6 +6738,7 @@ def write_fasta( (Default=60). :param str reference_sequence: As for the :meth:`.alignments` method. :param str missing_data_character: As for the :meth:`.alignments` method. + :param bool isolated_as_missing: As for the :meth:`.alignments` method. """ text_formats.write_fasta( self, @@ -6751,6 +6746,7 @@ def write_fasta( wrap_width=wrap_width, reference_sequence=reference_sequence, missing_data_character=missing_data_character, + isolated_as_missing=isolated_as_missing, ) def as_fasta(self, **kwargs): @@ -6774,6 +6770,7 @@ def write_nexus( include_alignments=None, reference_sequence=None, missing_data_character=None, + isolated_as_missing=None, ): """ Returns a `nexus encoding `_ @@ -6857,10 +6854,7 @@ def write_nexus( as our convention of using trees with multiple roots is not often supported by newick parsers. Thus, the method will raise a ValueError if we try to output trees with - multiple roots. Additionally, missing data - is not currently supported for alignment data. - See https://github.com/tskit-dev/tskit/issues/1896 for more - information. + multiple roots. .. seealso: See also the :meth:`.as_nexus` method which will return this nexus representation as a string. @@ -6875,6 +6869,7 @@ def write_nexus( :param str reference_sequence: As for the :meth:`.alignments` method. :param str missing_data_character: As for the :meth:`.alignments` method, but defaults to "?". + :param bool isolated_as_missing: As for the :meth:`.alignments` method. :return: A nexus representation of this :class:`TreeSequence` :rtype: str """ @@ -6886,6 +6881,7 @@ def write_nexus( include_alignments=include_alignments, reference_sequence=reference_sequence, missing_data_character=missing_data_character, + isolated_as_missing=isolated_as_missing, ) def as_nexus(self, **kwargs): diff --git a/python/tskit/util.py b/python/tskit/util.py index 3727a40191..64fad3f423 100644 --- a/python/tskit/util.py +++ b/python/tskit/util.py @@ -776,9 +776,8 @@ def variant_html(variant): """ + "\n".join( [ - "Nodes with Allele " - + ("missing" if k is None else "'" + k + "'") - + "" + f"""Nodes with Allele {'missing' if k is None + else "'" + k + "'"}""" + f"{format_number(counts[k])}" + " " + f"({format_number(freqs[k] * 100, 2)}%)"