Skip to content

Commit fe7724c

Browse files
committed
Have ref seq always be equal to sequence length
1 parent f5c89e0 commit fe7724c

File tree

2 files changed

+59
-55
lines changed

2 files changed

+59
-55
lines changed

python/tests/test_genotypes.py

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1697,7 +1697,7 @@ def test_alignments_left_right_subinterval(self):
16971697
ts = self.ts()
16981698
# Use a custom reference and a subinterval [2, 8)
16991699
ref = "A" * 10
1700-
got = list(ts.alignments(reference_sequence=ref[2:8], left=2, right=8))
1700+
got = list(ts.alignments(reference_sequence=ref, left=2, right=8))
17011701
assert got == ["GAAAAA", "AAAAAA", "AAAAAA", "NNNNNN"]
17021702

17031703
def test_fasta_reference_sequence(self):
@@ -1851,17 +1851,19 @@ def build_ts(self):
18511851
def test_whole_window_missing_at_ends(self):
18521852
ts = self.build_ts()
18531853
ref = "0123456789"
1854-
# Node is isolated outside [3,7): expect missing there; inside use ref, with site overlay at 5
1854+
# Node is isolated outside [3,7): expect missing there; inside use ref,
1855+
# with site overlay at 5
18551856
got = list(ts.alignments(samples=[1], reference_sequence=ref))
18561857
assert got == ["NNN34G6NNN"]
18571858

18581859
def test_subwindow(self):
18591860
ts = self.build_ts()
18601861
ref = "0123456789"
18611862
# Request [2,8): expect missing at 2 and 7, ref inside, site overlay at 5
1862-
got = list(ts.alignments(samples=[1], reference_sequence=ref[2:8], left=2, right=8))
1863+
got = list(ts.alignments(samples=[1], reference_sequence=ref, left=2, right=8))
18631864
assert got == ["N34G6N"]
18641865

1866+
18651867
class TestMultiRootExample:
18661868
# 1.00┊ 4 5 ┊
18671869
# ┊ ┏┻┓ ┏┻┓ ┊
@@ -2234,17 +2236,17 @@ def test_reference_length_mismatch(self, ref_length):
22342236
tables = tskit.TableCollection(10)
22352237
tables.reference_sequence.data = "A" * ref_length
22362238
ts = tables.tree_sequence()
2237-
if ref_length <= tables.sequence_length:
2238-
with pytest.raises(ValueError, match="shorter than"):
2239-
list(ts.alignments())
2240-
else:
2241-
# Longer reference sequences are allowed
2239+
with pytest.raises(
2240+
ValueError, match="must be equal to the tree sequence length"
2241+
):
22422242
list(ts.alignments())
22432243

22442244
@pytest.mark.parametrize("ref", ["", "xy"])
22452245
def test_reference_sequence_length_mismatch(self, ref):
22462246
ts = self.simplest_ts()
2247-
with pytest.raises(ValueError, match="shorter than"):
2247+
with pytest.raises(
2248+
ValueError, match="must be equal to the tree sequence length"
2249+
):
22482250
list(ts.alignments(reference_sequence=ref))
22492251

22502252
@pytest.mark.parametrize("ref", ["À", "┃", "α"])
@@ -2308,7 +2310,9 @@ def test_bad_restricted(self):
23082310
tables = tskit.TableCollection(10)
23092311
tables.reference_sequence.data = "A" * 7
23102312
ts = tables.tree_sequence()
2311-
with pytest.raises(ValueError, match="sequence ends before"):
2313+
with pytest.raises(
2314+
ValueError, match="must be equal to the tree sequence length"
2315+
):
23122316
list(ts.alignments(right=8))
23132317

23142318
def test_no_samples_default(self):
@@ -2342,16 +2346,20 @@ def test_reference_sequence_too_short_with_interval(self):
23422346
tables = tskit.TableCollection(10)
23432347
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
23442348
ts = tables.tree_sequence()
2345-
with pytest.raises(ValueError, match="ends before the requested stop position"):
2346-
list(ts.alignments(reference_sequence="A" * 5, left=2, right=8)) # L=6
2349+
with pytest.raises(
2350+
ValueError, match="must be equal to the tree sequence length"
2351+
):
2352+
list(ts.alignments(reference_sequence="A" * 5, left=2, right=8))
23472353

2348-
def test_reference_sequence_too_long_with_interval(self):
2349-
# Explicit ref longer than [left,right) span should also error
2354+
def test_reference_sequence_length_must_match_sequence(self):
2355+
# Explicit ref length must match full sequence length
23502356
tables = tskit.TableCollection(10)
23512357
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
23522358
ts = tables.tree_sequence()
2353-
with pytest.raises(ValueError, match="ends before the requested stop position"):
2354-
list(ts.alignments(reference_sequence="A" * 7, left=2, right=8)) # L=6
2359+
with pytest.raises(
2360+
ValueError, match="must be equal to the tree sequence length"
2361+
):
2362+
list(ts.alignments(reference_sequence="A" * 7, left=2, right=8))
23552363

23562364

23572365
class TestAlignmentExamples:
@@ -2412,14 +2420,9 @@ def _reference_alignments(
24122420
else:
24132421
reference_sequence = missing_data_character * L
24142422
if len(reference_sequence) != L:
2415-
if interval.right == int(ts.sequence_length):
2416-
raise ValueError(
2417-
"The reference sequence is shorter than the tree sequence length"
2418-
)
2419-
else:
2420-
raise ValueError(
2421-
"The reference sequence ends before the requested stop position"
2422-
)
2423+
raise ValueError(
2424+
"The reference sequence must be equal to the tree sequence length"
2425+
)
24232426
ref_array = np.frombuffer(reference_sequence.encode("ascii"), dtype=np.int8)
24242427
H, (first_site_id, last_site_id) = ts._haplotypes_array(
24252428
interval=interval,

python/tskit/trees.py

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5685,21 +5685,25 @@ def alignments(
56855685
single byte characters, (i.e., variants must be single nucleotide
56865686
polymorphisms, or SNPs).
56875687
5688-
Missing data handling:
5689-
- If ``isolated_as_missing=True`` (default), nodes that are isolated in a
5690-
tree (no parent and no children) are rendered as the missing character
5691-
across for all bases at that tree's interval except for sites with a
5692-
mutation directly above the node.
5693-
- If ``isolated_as_missing=False``, no missing-overlay is applied. At sites,
5694-
genotypes are decoded as usual; at non-sites, characters come from the
5688+
Missing data handling
5689+
5690+
- If ``isolated_as_missing=True`` (default), nodes that are isolated
5691+
(no parent and no children) are rendered as the missing character across
5692+
each tree interval. At site positions, the per-site allele overrides the
5693+
missing character; if a genotype is missing (``-1``), the missing
5694+
character is retained.
5695+
- If ``isolated_as_missing=False``, no missing overlay is applied. At sites,
5696+
genotypes are decoded as usual; at non-sites, bases come from the
56955697
reference sequence.
56965698
56975699
See also the :meth:`.variants` iterator for site-centric access
56985700
to sample genotypes and :meth:`.haplotypes` for access to sample sequences
56995701
at just the sites in the tree sequence.
57005702
57015703
:param str reference_sequence: The reference sequence to fill in
5702-
gaps between sites in the alignments.
5704+
gaps between sites in the alignments. If provided, it must be a
5705+
string of length equal to :attr:`.sequence_length`; the sequence is
5706+
sliced internally to the requested ``[left, right)`` interval.
57035707
:param str missing_data_character: A single ascii character that will
57045708
be used to represent missing data.
57055709
If any normal allele contains this character, an error is raised.
@@ -5735,32 +5739,29 @@ def alignments(
57355739

57365740
L = interval.span
57375741
a = np.empty(L, dtype=np.int8)
5738-
if reference_sequence is None:
5739-
if self.has_reference_sequence():
5740-
# This may be inefficient - see #1989. However, since we're
5741-
# n copies of the reference sequence anyway, this is a relatively
5742-
# minor tweak. We may also want to recode the below not to use direct
5743-
# access to the .data attribute, e.g. if we allow reference sequences
5744-
# to start at non-zero positions
5745-
reference_sequence = self.reference_sequence.data[
5746-
interval.left : interval.right
5747-
]
5748-
else:
5749-
reference_sequence = missing_data_character * L
5750-
5751-
if len(reference_sequence) != L:
5752-
if interval.right == int(self.sequence_length):
5753-
raise ValueError(
5754-
"The reference sequence is shorter than the tree sequence length"
5755-
)
5756-
else:
5742+
full_ref = None
5743+
if reference_sequence is not None:
5744+
full_ref = reference_sequence
5745+
elif self.has_reference_sequence():
5746+
# This may be inefficient - see #1989. However, since we're
5747+
# n copies of the reference sequence anyway, this is a relatively
5748+
# minor tweak. We may also want to recode the below not to use direct
5749+
# access to the .data attribute, e.g. if we allow reference sequences
5750+
# to start at non-zero positions
5751+
full_ref = self.reference_sequence.data
5752+
5753+
if full_ref is None:
5754+
ref_slice = missing_data_character * L
5755+
else:
5756+
if len(full_ref) != int(self.sequence_length):
57575757
raise ValueError(
5758-
"The reference sequence ends before the requested stop position"
5758+
"The reference sequence must be equal to the tree sequence length"
57595759
)
5760+
ref_slice = full_ref[interval.left : interval.right]
57605761

5761-
# TODO Replace this as simple-as-possible readable Python version with C
5762-
# Reusable reference buffer
5763-
ref_bytes = reference_sequence.encode("ascii")
5762+
# TODO Replace this readable Python version with a C backend
5763+
# Reusable reference buffer for this interval
5764+
ref_bytes = ref_slice.encode("ascii")
57645765
ref_array = np.frombuffer(ref_bytes, dtype=np.int8)
57655766

57665767
H, (first_site_id, last_site_id) = self._haplotypes_array(

0 commit comments

Comments
 (0)