From 6259fa96189ead957162cfdc0cf25f3d10c64b9c Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 22 Oct 2025 17:02:02 +0100 Subject: [PATCH 1/3] Remove guard against internal samples when decoding genotypes, instead checking when they are isolated --- c/tests/test_genotypes.c | 91 ++++++++++++++++++++++++++++++++-- c/tskit/core.c | 4 -- c/tskit/core.h | 6 --- c/tskit/genotypes.c | 66 ++++++++++++++++++------ c/tskit/genotypes.h | 1 + python/tests/test_genotypes.py | 21 +++++++- 6 files changed, 159 insertions(+), 30 deletions(-) diff --git a/c/tests/test_genotypes.c b/c/tests/test_genotypes.c index fd597439e0..27c32cc424 100644 --- a/c/tests/test_genotypes.c +++ b/c/tests/test_genotypes.c @@ -512,10 +512,48 @@ test_single_tree_non_samples(void) tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); - /* It's an error to hand in non-samples without imputation turned on */ ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUST_IMPUTE_NON_SAMPLES); - tsk_vargen_free(&vargen); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_print_state(&vargen, _devnull); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site.id, 0); + CU_ASSERT_EQUAL(var->site.mutations_length, 1); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site.id, 1); + CU_ASSERT_EQUAL(var->site.mutations_length, 2); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site.id, 2); + CU_ASSERT_EQUAL(var->site.mutations_length, 4); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_vargen_free(&vargen); + CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -559,6 +597,52 @@ test_single_tree_non_samples(void) tsk_treeseq_free(&ts); } +static void +test_internal_node_missing_data(void) +{ + int ret = 0; + const char *nodes = "1 0 -1 -1\n" + "1 0 -1 -1\n" + "0 1 -1 -1\n" + "0 1 -1 -1\n"; + const char *edges = "0 1 2 0\n" + "0 1 2 1\n" + "1 2 3 0\n" + "1 2 3 1\n"; + const char *sites = "0.5 0\n" + "1.5 0\n"; + tsk_treeseq_t ts; + tsk_vargen_t vargen; + tsk_variant_t *var; + tsk_id_t samples[] = { 2, 3 }; + + tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, NULL, NULL, NULL, 0); + + ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_TRUE(var->has_missing_data); + CU_ASSERT_EQUAL(var->site.id, 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_TRUE(var->has_missing_data); + CU_ASSERT_EQUAL(var->site.id, 1); + CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_vargen_free(&vargen); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts); +} + static void test_single_tree_errors(void) { @@ -1123,6 +1207,7 @@ main(int argc, char **argv) { "test_single_tree_char_alphabet", test_single_tree_char_alphabet }, { "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet }, { "test_single_tree_non_samples", test_single_tree_non_samples }, + { "test_internal_node_missing_data", test_internal_node_missing_data }, { "test_single_tree_errors", test_single_tree_errors }, { "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors }, { "test_single_tree_subsample", test_single_tree_subsample }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 5e5f828943..d4782e2d16 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -569,10 +569,6 @@ tsk_strerror_internal(int err) break; /* Genotype decoding errors */ - case TSK_ERR_MUST_IMPUTE_NON_SAMPLES: - ret = "Cannot generate genotypes for non-samples when isolated nodes are " - "considered as missing. (TSK_ERR_MUST_IMPUTE_NON_SAMPLES)"; - break; case TSK_ERR_ALLELE_NOT_FOUND: ret = "An allele was not found in the user-specified allele map. " "(TSK_ERR_ALLELE_NOT_FOUND)"; diff --git a/c/tskit/core.h b/c/tskit/core.h index 76ac086957..9f0ae31531 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -785,12 +785,6 @@ than 0. @{ */ /** -Genotypes were requested for non-samples at the same time -as asking that isolated nodes be marked as missing. This is not -supported. -*/ -#define TSK_ERR_MUST_IMPUTE_NON_SAMPLES -1100 -/** A user-specified allele map was used, but didn't contain an allele found in the tree sequence. */ diff --git a/c/tskit/genotypes.c b/c/tskit/genotypes.c index c2385281bd..9e5dbfa8d3 100644 --- a/c/tskit/genotypes.c +++ b/c/tskit/genotypes.c @@ -90,12 +90,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles) static int variant_init_samples_and_index_map(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, - size_t num_samples_alloc, tsk_flags_t options) + size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; - const tsk_flags_t *flags = tree_sequence->tables->nodes.flags; tsk_size_t j, num_nodes; - bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING); tsk_id_t u; num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); @@ -120,11 +118,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self, ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } - /* We can only detect missing data for samples */ - if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) { - ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES); - goto out; - } self->alt_sample_index_map[samples[j]] = (tsk_id_t) j; } out: @@ -218,8 +211,8 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, } self->sample_index_map = self->alt_sample_index_map; } - /* When a list of samples is given, we use the traversal based algorithm - * which doesn't use sample list tracking in the tree */ + /* When a list of samples is given, we use the traversal based algorithm, + * so we need traversal scratch space and presence tracking. */ if (self->alt_samples != NULL) { self->traversal_stack = tsk_malloc( tsk_treeseq_get_num_nodes(tree_sequence) * sizeof(*self->traversal_stack)); @@ -227,6 +220,12 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } + self->sample_is_present + = tsk_malloc(num_samples_alloc * sizeof(*self->sample_is_present)); + if (self->sample_is_present == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } } self->genotypes = tsk_malloc(num_samples_alloc * sizeof(*self->genotypes)); @@ -274,6 +273,7 @@ tsk_variant_free(tsk_variant_t *self) tsk_safe_free(self->samples); tsk_safe_free(self->alt_samples); tsk_safe_free(self->alt_sample_index_map); + tsk_safe_free(self->sample_is_present); tsk_safe_free(self->traversal_stack); return 0; } @@ -425,13 +425,48 @@ tsk_variant_mark_missing(tsk_variant_t *self) const tsk_id_t *restrict sample_index_map = self->sample_index_map; const tsk_id_t N = self->tree.virtual_root; int32_t *restrict genotypes = self->genotypes; - tsk_id_t root, sample_index; + tsk_id_t root, sample_index, u, v; + const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; + bool *restrict present = self->sample_is_present; + tsk_id_t *restrict stack = self->traversal_stack; + int stack_top = -1; + tsk_size_t j = 0; + + if (self->alt_samples == NULL) { + for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { + if (left_child[root] == TSK_NULL) { + sample_index = sample_index_map[root]; + if (sample_index != TSK_NULL) { + genotypes[sample_index] = TSK_MISSING_DATA; + num_missing++; + } + } + } + } else { + tsk_memset(present, 0, self->num_samples * sizeof(*present)); + + for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { + stack[++stack_top] = root; + } - for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { - if (left_child[root] == TSK_NULL) { - sample_index = sample_index_map[root]; + while (stack_top >= 0) { + u = stack[stack_top--]; + sample_index = sample_index_map[u]; if (sample_index != TSK_NULL) { - genotypes[sample_index] = TSK_MISSING_DATA; + present[sample_index] = true; + } + for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { + stack[++stack_top] = v; + } + } + + for (j = 0; j < self->num_samples; j++) { + u = self->samples[j]; + if (!present[j] + || ((flags[u] & TSK_NODE_IS_SAMPLE) != 0 + && self->tree.parent[u] == TSK_NULL + && left_child[u] == TSK_NULL)) { + genotypes[j] = TSK_MISSING_DATA; num_missing++; } } @@ -591,6 +626,7 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) other->sample_index_map = NULL; other->alt_samples = NULL; other->alt_sample_index_map = NULL; + other->sample_is_present = NULL; other->user_alleles_mem = NULL; total_len = 0; diff --git a/c/tskit/genotypes.h b/c/tskit/genotypes.h index 8c3b769e5a..1388ba1139 100644 --- a/c/tskit/genotypes.h +++ b/c/tskit/genotypes.h @@ -70,6 +70,7 @@ typedef struct { tsk_id_t *samples; const tsk_id_t *sample_index_map; + bool *sample_is_present; bool user_alleles; char *user_alleles_mem; tsk_id_t *traversal_stack; diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index e8af9841a1..ae0c4af4fc 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -1283,8 +1283,8 @@ def test_empty_samples(self): def test_non_sample_samples(self): ts = self.ts() - with pytest.raises(tskit.LibraryError, match="MUST_IMPUTE_NON_SAMPLES"): - list(ts.alignments(samples=[4])) + alignment = list(ts.alignments(samples=[4])) + assert alignment == ["NNANNNNNNT"] def test_alignments_missing_data_char(self): A = list(self.ts().alignments(missing_data_character="x")) @@ -1308,6 +1308,23 @@ def test_alignments_reference_sequence_embedded_null(self): assert A[1] == "01A3\x005678C" assert A[2] == "01A3\x005678C" + def test_internal_node_missing_alignments(self): + tables = tskit.TableCollection(sequence_length=2) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + tables.nodes.add_row(time=1) + tables.nodes.add_row(time=1) + tables.edges.add_row(left=0, right=1, parent=2, child=0) + tables.edges.add_row(left=0, right=1, parent=2, child=1) + tables.edges.add_row(left=1, right=2, parent=3, child=0) + tables.edges.add_row(left=1, right=2, parent=3, child=1) + tables.sites.add_row(position=0, ancestral_state="A") + tables.sites.add_row(position=1, ancestral_state="C") + ts = tables.tree_sequence() + alignments = list(ts.alignments(samples=[2, 3])) + assert alignments[0] == "AN" + assert alignments[1] == "NC" + def test_fasta_default(self): expected = textwrap.dedent( """\ From ff0e9f6c3828d0b4cc5eaaca89da0a122d562b35 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 23 Oct 2025 09:57:03 +0100 Subject: [PATCH 2/3] Consistency tweaks --- c/tskit/genotypes.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/c/tskit/genotypes.c b/c/tskit/genotypes.c index 9e5dbfa8d3..bb1205674e 100644 --- a/c/tskit/genotypes.c +++ b/c/tskit/genotypes.c @@ -216,13 +216,9 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, if (self->alt_samples != NULL) { self->traversal_stack = tsk_malloc( tsk_treeseq_get_num_nodes(tree_sequence) * sizeof(*self->traversal_stack)); - if (self->traversal_stack == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } self->sample_is_present = tsk_malloc(num_samples_alloc * sizeof(*self->sample_is_present)); - if (self->sample_is_present == NULL) { + if (self->traversal_stack == NULL || self->sample_is_present == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } @@ -446,17 +442,20 @@ tsk_variant_mark_missing(tsk_variant_t *self) tsk_memset(present, 0, self->num_samples * sizeof(*present)); for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { - stack[++stack_top] = root; + stack_top++; + stack[stack_top] = root; } while (stack_top >= 0) { - u = stack[stack_top--]; + u = stack[stack_top]; + stack_top--; sample_index = sample_index_map[u]; if (sample_index != TSK_NULL) { present[sample_index] = true; } for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - stack[++stack_top] = v; + stack_top++; + stack[stack_top] = v; } } From dc9e33bd6a41522ecf61676eb52759a9d6c1e2f6 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 23 Oct 2025 12:53:54 +0100 Subject: [PATCH 3/3] Only calculate visited nodes per-tree --- c/tskit/genotypes.c | 90 +++++++++++++++++++++++++++++---------------- c/tskit/genotypes.h | 1 + 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/c/tskit/genotypes.c b/c/tskit/genotypes.c index bb1205674e..922c45edd9 100644 --- a/c/tskit/genotypes.c +++ b/c/tskit/genotypes.c @@ -134,6 +134,7 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, tsk_size_t num_samples_alloc; tsk_memset(self, 0, sizeof(tsk_variant_t)); + self->missingmess_cache_tree_index = TSK_NULL; /* Set site id to NULL to indicate the variant is not decoded */ self->site.id = TSK_NULL; @@ -412,6 +413,55 @@ tsk_variant_update_genotypes_traversal( return tsk_variant_traverse(self, node, derived, tsk_variant_visit); } +static void +tsk_variant_update_missing_cache(tsk_variant_t *self) +{ + const tsk_id_t *restrict left_child = self->tree.left_child; + const tsk_id_t *restrict right_sib = self->tree.right_sib; + const tsk_id_t *restrict sample_index_map = self->sample_index_map; + const tsk_id_t N = self->tree.virtual_root; + const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; + bool *restrict present = self->sample_is_present; + tsk_id_t *restrict stack = self->traversal_stack; + tsk_id_t root, u, v, sample_index; + tsk_size_t j; + int stack_top; + + tsk_bug_assert(self->alt_samples != NULL); + tsk_bug_assert(self->tree.index != TSK_NULL); + + tsk_memset(present, 0, self->num_samples * sizeof(*present)); + + stack_top = -1; + for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { + stack_top++; + stack[stack_top] = root; + } + + while (stack_top >= 0) { + u = stack[stack_top]; + stack_top--; + sample_index = sample_index_map[u]; + if (sample_index != TSK_NULL) { + present[sample_index] = true; + } + for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { + stack_top++; + stack[stack_top] = v; + } + } + + for (j = 0; j < self->num_samples; j++) { + u = self->samples[j]; + present[j] + = present[j] + && !((flags[u] & TSK_NODE_IS_SAMPLE) != 0 + && self->tree.parent[u] == TSK_NULL && left_child[u] == TSK_NULL); + } + + self->missingmess_cache_tree_index = self->tree.index; +} + static tsk_size_t tsk_variant_mark_missing(tsk_variant_t *self) { @@ -421,12 +471,8 @@ tsk_variant_mark_missing(tsk_variant_t *self) const tsk_id_t *restrict sample_index_map = self->sample_index_map; const tsk_id_t N = self->tree.virtual_root; int32_t *restrict genotypes = self->genotypes; - tsk_id_t root, sample_index, u, v; - const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; - bool *restrict present = self->sample_is_present; - tsk_id_t *restrict stack = self->traversal_stack; - int stack_top = -1; - tsk_size_t j = 0; + tsk_id_t root, sample_index; + tsk_size_t j; if (self->alt_samples == NULL) { for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { @@ -439,37 +485,14 @@ tsk_variant_mark_missing(tsk_variant_t *self) } } } else { - tsk_memset(present, 0, self->num_samples * sizeof(*present)); - - for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { - stack_top++; - stack[stack_top] = root; - } - - while (stack_top >= 0) { - u = stack[stack_top]; - stack_top--; - sample_index = sample_index_map[u]; - if (sample_index != TSK_NULL) { - present[sample_index] = true; - } - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - stack_top++; - stack[stack_top] = v; - } - } - for (j = 0; j < self->num_samples; j++) { - u = self->samples[j]; - if (!present[j] - || ((flags[u] & TSK_NODE_IS_SAMPLE) != 0 - && self->tree.parent[u] == TSK_NULL - && left_child[u] == TSK_NULL)) { + if (!self->sample_is_present[j]) { genotypes[j] = TSK_MISSING_DATA; num_missing++; } } } + return num_missing; } @@ -570,6 +593,10 @@ tsk_variant_decode( * mutations directly over samples should not be missing */ num_missing = 0; if (!impute_missing) { + if (self->alt_samples != NULL + && self->missingmess_cache_tree_index != self->tree.index) { + tsk_variant_update_missing_cache(self); + } num_missing = mark_missing(self); } for (j = 0; j < self->site.mutations_length; j++) { @@ -627,6 +654,7 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) other->alt_sample_index_map = NULL; other->sample_is_present = NULL; other->user_alleles_mem = NULL; + other->missingmess_cache_tree_index = TSK_NULL; total_len = 0; for (j = 0; j < self->num_alleles; j++) { diff --git a/c/tskit/genotypes.h b/c/tskit/genotypes.h index 1388ba1139..29f4eef2da 100644 --- a/c/tskit/genotypes.h +++ b/c/tskit/genotypes.h @@ -77,6 +77,7 @@ typedef struct { tsk_flags_t options; tsk_id_t *alt_samples; tsk_id_t *alt_sample_index_map; + tsk_id_t missingmess_cache_tree_index; } tsk_variant_t;