Skip to content

Commit 6259fa9

Browse files
committed
Remove guard against internal samples when decoding genotypes, instead checking when they are isolated
1 parent f43ab1f commit 6259fa9

File tree

6 files changed

+159
-30
lines changed

6 files changed

+159
-30
lines changed

c/tests/test_genotypes.c

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -512,10 +512,48 @@ test_single_tree_non_samples(void)
512512

513513
tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
514514
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
515-
/* It's an error to hand in non-samples without imputation turned on */
516515
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
517-
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
518-
tsk_vargen_free(&vargen);
516+
CU_ASSERT_EQUAL_FATAL(ret, 0);
517+
tsk_vargen_print_state(&vargen, _devnull);
518+
519+
ret = tsk_vargen_next(&vargen, &var);
520+
CU_ASSERT_EQUAL_FATAL(ret, 1);
521+
CU_ASSERT_FALSE(var->has_missing_data);
522+
CU_ASSERT_EQUAL(var->genotypes[0], 0);
523+
CU_ASSERT_EQUAL(var->genotypes[1], 0);
524+
CU_ASSERT_EQUAL(var->num_alleles, 2);
525+
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
526+
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
527+
CU_ASSERT_EQUAL(var->site.id, 0);
528+
CU_ASSERT_EQUAL(var->site.mutations_length, 1);
529+
530+
ret = tsk_vargen_next(&vargen, &var);
531+
CU_ASSERT_EQUAL_FATAL(ret, 1);
532+
CU_ASSERT_FALSE(var->has_missing_data);
533+
CU_ASSERT_EQUAL(var->genotypes[1], 0);
534+
CU_ASSERT_EQUAL(var->genotypes[0], 1);
535+
CU_ASSERT_EQUAL(var->num_alleles, 2);
536+
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
537+
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
538+
CU_ASSERT_EQUAL(var->site.id, 1);
539+
CU_ASSERT_EQUAL(var->site.mutations_length, 2);
540+
541+
ret = tsk_vargen_next(&vargen, &var);
542+
CU_ASSERT_EQUAL_FATAL(ret, 1);
543+
CU_ASSERT_FALSE(var->has_missing_data);
544+
CU_ASSERT_EQUAL(var->genotypes[0], 0);
545+
CU_ASSERT_EQUAL(var->genotypes[1], 0);
546+
CU_ASSERT_EQUAL(var->num_alleles, 2);
547+
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
548+
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
549+
CU_ASSERT_EQUAL(var->site.id, 2);
550+
CU_ASSERT_EQUAL(var->site.mutations_length, 4);
551+
552+
ret = tsk_vargen_next(&vargen, &var);
553+
CU_ASSERT_EQUAL_FATAL(ret, 0);
554+
555+
ret = tsk_vargen_free(&vargen);
556+
CU_ASSERT_EQUAL_FATAL(ret, 0);
519557

520558
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_ISOLATED_NOT_MISSING);
521559
CU_ASSERT_EQUAL_FATAL(ret, 0);
@@ -559,6 +597,52 @@ test_single_tree_non_samples(void)
559597
tsk_treeseq_free(&ts);
560598
}
561599

600+
static void
601+
test_internal_node_missing_data(void)
602+
{
603+
int ret = 0;
604+
const char *nodes = "1 0 -1 -1\n"
605+
"1 0 -1 -1\n"
606+
"0 1 -1 -1\n"
607+
"0 1 -1 -1\n";
608+
const char *edges = "0 1 2 0\n"
609+
"0 1 2 1\n"
610+
"1 2 3 0\n"
611+
"1 2 3 1\n";
612+
const char *sites = "0.5 0\n"
613+
"1.5 0\n";
614+
tsk_treeseq_t ts;
615+
tsk_vargen_t vargen;
616+
tsk_variant_t *var;
617+
tsk_id_t samples[] = { 2, 3 };
618+
619+
tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, NULL, NULL, NULL, 0);
620+
621+
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
622+
CU_ASSERT_EQUAL_FATAL(ret, 0);
623+
624+
ret = tsk_vargen_next(&vargen, &var);
625+
CU_ASSERT_EQUAL_FATAL(ret, 1);
626+
CU_ASSERT_TRUE(var->has_missing_data);
627+
CU_ASSERT_EQUAL(var->site.id, 0);
628+
CU_ASSERT_EQUAL(var->genotypes[0], 0);
629+
CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA);
630+
631+
ret = tsk_vargen_next(&vargen, &var);
632+
CU_ASSERT_EQUAL_FATAL(ret, 1);
633+
CU_ASSERT_TRUE(var->has_missing_data);
634+
CU_ASSERT_EQUAL(var->site.id, 1);
635+
CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA);
636+
CU_ASSERT_EQUAL(var->genotypes[1], 0);
637+
638+
ret = tsk_vargen_next(&vargen, &var);
639+
CU_ASSERT_EQUAL_FATAL(ret, 0);
640+
641+
ret = tsk_vargen_free(&vargen);
642+
CU_ASSERT_EQUAL_FATAL(ret, 0);
643+
tsk_treeseq_free(&ts);
644+
}
645+
562646
static void
563647
test_single_tree_errors(void)
564648
{
@@ -1123,6 +1207,7 @@ main(int argc, char **argv)
11231207
{ "test_single_tree_char_alphabet", test_single_tree_char_alphabet },
11241208
{ "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet },
11251209
{ "test_single_tree_non_samples", test_single_tree_non_samples },
1210+
{ "test_internal_node_missing_data", test_internal_node_missing_data },
11261211
{ "test_single_tree_errors", test_single_tree_errors },
11271212
{ "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors },
11281213
{ "test_single_tree_subsample", test_single_tree_subsample },

c/tskit/core.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -569,10 +569,6 @@ tsk_strerror_internal(int err)
569569
break;
570570

571571
/* Genotype decoding errors */
572-
case TSK_ERR_MUST_IMPUTE_NON_SAMPLES:
573-
ret = "Cannot generate genotypes for non-samples when isolated nodes are "
574-
"considered as missing. (TSK_ERR_MUST_IMPUTE_NON_SAMPLES)";
575-
break;
576572
case TSK_ERR_ALLELE_NOT_FOUND:
577573
ret = "An allele was not found in the user-specified allele map. "
578574
"(TSK_ERR_ALLELE_NOT_FOUND)";

c/tskit/core.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -785,12 +785,6 @@ than 0.
785785
@{
786786
*/
787787
/**
788-
Genotypes were requested for non-samples at the same time
789-
as asking that isolated nodes be marked as missing. This is not
790-
supported.
791-
*/
792-
#define TSK_ERR_MUST_IMPUTE_NON_SAMPLES -1100
793-
/**
794788
A user-specified allele map was used, but didn't contain an allele
795789
found in the tree sequence.
796790
*/

c/tskit/genotypes.c

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles)
9090
static int
9191
variant_init_samples_and_index_map(tsk_variant_t *self,
9292
const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples,
93-
size_t num_samples_alloc, tsk_flags_t options)
93+
size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options))
9494
{
9595
int ret = 0;
96-
const tsk_flags_t *flags = tree_sequence->tables->nodes.flags;
9796
tsk_size_t j, num_nodes;
98-
bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING);
9997
tsk_id_t u;
10098

10199
num_nodes = tsk_treeseq_get_num_nodes(tree_sequence);
@@ -120,11 +118,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self,
120118
ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE);
121119
goto out;
122120
}
123-
/* We can only detect missing data for samples */
124-
if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) {
125-
ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
126-
goto out;
127-
}
128121
self->alt_sample_index_map[samples[j]] = (tsk_id_t) j;
129122
}
130123
out:
@@ -218,15 +211,21 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence,
218211
}
219212
self->sample_index_map = self->alt_sample_index_map;
220213
}
221-
/* When a list of samples is given, we use the traversal based algorithm
222-
* which doesn't use sample list tracking in the tree */
214+
/* When a list of samples is given, we use the traversal based algorithm,
215+
* so we need traversal scratch space and presence tracking. */
223216
if (self->alt_samples != NULL) {
224217
self->traversal_stack = tsk_malloc(
225218
tsk_treeseq_get_num_nodes(tree_sequence) * sizeof(*self->traversal_stack));
226219
if (self->traversal_stack == NULL) {
227220
ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
228221
goto out;
229222
}
223+
self->sample_is_present
224+
= tsk_malloc(num_samples_alloc * sizeof(*self->sample_is_present));
225+
if (self->sample_is_present == NULL) {
226+
ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
227+
goto out;
228+
}
230229
}
231230

232231
self->genotypes = tsk_malloc(num_samples_alloc * sizeof(*self->genotypes));
@@ -274,6 +273,7 @@ tsk_variant_free(tsk_variant_t *self)
274273
tsk_safe_free(self->samples);
275274
tsk_safe_free(self->alt_samples);
276275
tsk_safe_free(self->alt_sample_index_map);
276+
tsk_safe_free(self->sample_is_present);
277277
tsk_safe_free(self->traversal_stack);
278278
return 0;
279279
}
@@ -425,13 +425,48 @@ tsk_variant_mark_missing(tsk_variant_t *self)
425425
const tsk_id_t *restrict sample_index_map = self->sample_index_map;
426426
const tsk_id_t N = self->tree.virtual_root;
427427
int32_t *restrict genotypes = self->genotypes;
428-
tsk_id_t root, sample_index;
428+
tsk_id_t root, sample_index, u, v;
429+
const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags;
430+
bool *restrict present = self->sample_is_present;
431+
tsk_id_t *restrict stack = self->traversal_stack;
432+
int stack_top = -1;
433+
tsk_size_t j = 0;
434+
435+
if (self->alt_samples == NULL) {
436+
for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
437+
if (left_child[root] == TSK_NULL) {
438+
sample_index = sample_index_map[root];
439+
if (sample_index != TSK_NULL) {
440+
genotypes[sample_index] = TSK_MISSING_DATA;
441+
num_missing++;
442+
}
443+
}
444+
}
445+
} else {
446+
tsk_memset(present, 0, self->num_samples * sizeof(*present));
447+
448+
for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
449+
stack[++stack_top] = root;
450+
}
429451

430-
for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
431-
if (left_child[root] == TSK_NULL) {
432-
sample_index = sample_index_map[root];
452+
while (stack_top >= 0) {
453+
u = stack[stack_top--];
454+
sample_index = sample_index_map[u];
433455
if (sample_index != TSK_NULL) {
434-
genotypes[sample_index] = TSK_MISSING_DATA;
456+
present[sample_index] = true;
457+
}
458+
for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
459+
stack[++stack_top] = v;
460+
}
461+
}
462+
463+
for (j = 0; j < self->num_samples; j++) {
464+
u = self->samples[j];
465+
if (!present[j]
466+
|| ((flags[u] & TSK_NODE_IS_SAMPLE) != 0
467+
&& self->tree.parent[u] == TSK_NULL
468+
&& left_child[u] == TSK_NULL)) {
469+
genotypes[j] = TSK_MISSING_DATA;
435470
num_missing++;
436471
}
437472
}
@@ -591,6 +626,7 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other)
591626
other->sample_index_map = NULL;
592627
other->alt_samples = NULL;
593628
other->alt_sample_index_map = NULL;
629+
other->sample_is_present = NULL;
594630
other->user_alleles_mem = NULL;
595631

596632
total_len = 0;

c/tskit/genotypes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ typedef struct {
7070
tsk_id_t *samples;
7171

7272
const tsk_id_t *sample_index_map;
73+
bool *sample_is_present;
7374
bool user_alleles;
7475
char *user_alleles_mem;
7576
tsk_id_t *traversal_stack;

python/tests/test_genotypes.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1283,8 +1283,8 @@ def test_empty_samples(self):
12831283

12841284
def test_non_sample_samples(self):
12851285
ts = self.ts()
1286-
with pytest.raises(tskit.LibraryError, match="MUST_IMPUTE_NON_SAMPLES"):
1287-
list(ts.alignments(samples=[4]))
1286+
alignment = list(ts.alignments(samples=[4]))
1287+
assert alignment == ["NNANNNNNNT"]
12881288

12891289
def test_alignments_missing_data_char(self):
12901290
A = list(self.ts().alignments(missing_data_character="x"))
@@ -1308,6 +1308,23 @@ def test_alignments_reference_sequence_embedded_null(self):
13081308
assert A[1] == "01A3\x005678C"
13091309
assert A[2] == "01A3\x005678C"
13101310

1311+
def test_internal_node_missing_alignments(self):
1312+
tables = tskit.TableCollection(sequence_length=2)
1313+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
1314+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
1315+
tables.nodes.add_row(time=1)
1316+
tables.nodes.add_row(time=1)
1317+
tables.edges.add_row(left=0, right=1, parent=2, child=0)
1318+
tables.edges.add_row(left=0, right=1, parent=2, child=1)
1319+
tables.edges.add_row(left=1, right=2, parent=3, child=0)
1320+
tables.edges.add_row(left=1, right=2, parent=3, child=1)
1321+
tables.sites.add_row(position=0, ancestral_state="A")
1322+
tables.sites.add_row(position=1, ancestral_state="C")
1323+
ts = tables.tree_sequence()
1324+
alignments = list(ts.alignments(samples=[2, 3]))
1325+
assert alignments[0] == "AN"
1326+
assert alignments[1] == "NC"
1327+
13111328
def test_fasta_default(self):
13121329
expected = textwrap.dedent(
13131330
"""\

0 commit comments

Comments
 (0)