Skip to content

Commit e73bb7b

Browse files
committed
Allow internal nodes when isolate_as_missing
1 parent dacbf38 commit e73bb7b

File tree

6 files changed

+476
-63
lines changed

6 files changed

+476
-63
lines changed

c/tests/test_genotypes.c

Lines changed: 93 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -512,10 +512,45 @@ test_single_tree_non_samples(void)
512512

513513
tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
514514
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
515-
/* It's an error to hand in non-samples without imputation turned on */
516515
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
517-
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
518-
tsk_vargen_free(&vargen);
516+
CU_ASSERT_EQUAL_FATAL(ret, 0);
517+
tsk_vargen_print_state(&vargen, _devnull);
518+
519+
ret = tsk_vargen_next(&vargen, &var);
520+
CU_ASSERT_EQUAL_FATAL(ret, 1);
521+
CU_ASSERT_EQUAL(var->genotypes[0], 0);
522+
CU_ASSERT_EQUAL(var->genotypes[1], 0);
523+
CU_ASSERT_EQUAL(var->num_alleles, 2);
524+
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
525+
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
526+
CU_ASSERT_EQUAL(var->site.id, 0);
527+
CU_ASSERT_EQUAL(var->site.mutations_length, 1);
528+
529+
ret = tsk_vargen_next(&vargen, &var);
530+
CU_ASSERT_EQUAL_FATAL(ret, 1);
531+
CU_ASSERT_EQUAL(var->genotypes[1], 0);
532+
CU_ASSERT_EQUAL(var->genotypes[0], 1);
533+
CU_ASSERT_EQUAL(var->num_alleles, 2);
534+
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
535+
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
536+
CU_ASSERT_EQUAL(var->site.id, 1);
537+
CU_ASSERT_EQUAL(var->site.mutations_length, 2);
538+
539+
ret = tsk_vargen_next(&vargen, &var);
540+
CU_ASSERT_EQUAL_FATAL(ret, 1);
541+
CU_ASSERT_EQUAL(var->genotypes[0], 0);
542+
CU_ASSERT_EQUAL(var->genotypes[1], 0);
543+
CU_ASSERT_EQUAL(var->num_alleles, 2);
544+
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
545+
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
546+
CU_ASSERT_EQUAL(var->site.id, 2);
547+
CU_ASSERT_EQUAL(var->site.mutations_length, 4);
548+
549+
ret = tsk_vargen_next(&vargen, &var);
550+
CU_ASSERT_EQUAL_FATAL(ret, 0);
551+
552+
ret = tsk_vargen_free(&vargen);
553+
CU_ASSERT_EQUAL_FATAL(ret, 0);
519554

520555
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_ISOLATED_NOT_MISSING);
521556
CU_ASSERT_EQUAL_FATAL(ret, 0);
@@ -559,6 +594,60 @@ test_single_tree_non_samples(void)
559594
tsk_treeseq_free(&ts);
560595
}
561596

597+
static void
598+
test_isolated_internal_node(void)
599+
{
600+
int ret = 0;
601+
tsk_treeseq_t ts;
602+
tsk_vargen_t vargen;
603+
tsk_variant_t *var;
604+
/* Two sample nodes (0,1), plus an internal non-sample node u=2 with no edges */
605+
const char *nodes = "1 0 -1 -1\n"
606+
"1 0 -1 -1\n"
607+
"0 1 -1 -1\n";
608+
const char *sites = "2.0 A\n"
609+
"9.0 T\n";
610+
tsk_id_t samples[] = { 2 };
611+
612+
tsk_treeseq_from_text(&ts, 10, nodes, "", NULL, sites, NULL, NULL, NULL, 0);
613+
CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_nodes(&ts), 3);
614+
CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_samples(&ts), 2);
615+
CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_sites(&ts), 2);
616+
617+
/* Default options (isolated_as_missing=True): internal node is isolated everywhere
618+
*/
619+
ret = tsk_vargen_init(&vargen, &ts, samples, 1, NULL, 0);
620+
CU_ASSERT_EQUAL_FATAL(ret, 0);
621+
ret = tsk_vargen_next(&vargen, &var);
622+
CU_ASSERT_EQUAL_FATAL(ret, 1);
623+
CU_ASSERT_TRUE(var->has_missing_data);
624+
CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA);
625+
ret = tsk_vargen_next(&vargen, &var);
626+
CU_ASSERT_EQUAL_FATAL(ret, 1);
627+
CU_ASSERT_TRUE(var->has_missing_data);
628+
CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA);
629+
ret = tsk_vargen_next(&vargen, &var);
630+
CU_ASSERT_EQUAL_FATAL(ret, 0);
631+
tsk_vargen_free(&vargen);
632+
633+
/* Impute missing (isolated_as_missing=False): genotypes should be ancestral (0) */
634+
ret = tsk_vargen_init(&vargen, &ts, samples, 1, NULL, TSK_ISOLATED_NOT_MISSING);
635+
CU_ASSERT_EQUAL_FATAL(ret, 0);
636+
ret = tsk_vargen_next(&vargen, &var);
637+
CU_ASSERT_EQUAL_FATAL(ret, 1);
638+
CU_ASSERT_FALSE(var->has_missing_data);
639+
CU_ASSERT_EQUAL(var->genotypes[0], 0);
640+
ret = tsk_vargen_next(&vargen, &var);
641+
CU_ASSERT_EQUAL_FATAL(ret, 1);
642+
CU_ASSERT_FALSE(var->has_missing_data);
643+
CU_ASSERT_EQUAL(var->genotypes[0], 0);
644+
ret = tsk_vargen_next(&vargen, &var);
645+
CU_ASSERT_EQUAL_FATAL(ret, 0);
646+
tsk_vargen_free(&vargen);
647+
648+
tsk_treeseq_free(&ts);
649+
}
650+
562651
static void
563652
test_single_tree_errors(void)
564653
{
@@ -1123,6 +1212,7 @@ main(int argc, char **argv)
11231212
{ "test_single_tree_char_alphabet", test_single_tree_char_alphabet },
11241213
{ "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet },
11251214
{ "test_single_tree_non_samples", test_single_tree_non_samples },
1215+
{ "test_isolated_internal_node", test_isolated_internal_node },
11261216
{ "test_single_tree_errors", test_single_tree_errors },
11271217
{ "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors },
11281218
{ "test_single_tree_subsample", test_single_tree_subsample },

c/tskit/genotypes.c

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles)
9090
static int
9191
variant_init_samples_and_index_map(tsk_variant_t *self,
9292
const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples,
93-
size_t num_samples_alloc, tsk_flags_t options)
93+
size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options))
9494
{
9595
int ret = 0;
96-
const tsk_flags_t *flags = tree_sequence->tables->nodes.flags;
9796
tsk_size_t j, num_nodes;
98-
bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING);
9997
tsk_id_t u;
10098

10199
num_nodes = tsk_treeseq_get_num_nodes(tree_sequence);
@@ -120,11 +118,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self,
120118
ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE);
121119
goto out;
122120
}
123-
/* We can only detect missing data for samples */
124-
if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) {
125-
ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
126-
goto out;
127-
}
128121
self->alt_sample_index_map[samples[j]] = (tsk_id_t) j;
129122
}
130123
out:
@@ -439,6 +432,27 @@ tsk_variant_mark_missing(tsk_variant_t *self)
439432
return num_missing;
440433
}
441434

435+
/* Mark missing for any requested node (sample or non-sample) that is isolated
436+
* in the current tree, i.e., has no parent and no children at this position. */
437+
static tsk_size_t
438+
tsk_variant_mark_missing_any(tsk_variant_t *self)
439+
{
440+
tsk_size_t num_missing = 0;
441+
int32_t *restrict genotypes = self->genotypes;
442+
const tsk_id_t *restrict parent = self->tree.parent;
443+
const tsk_id_t *restrict left_child = self->tree.left_child;
444+
tsk_size_t j;
445+
446+
for (j = 0; j < self->num_samples; j++) {
447+
tsk_id_t u = self->samples[j];
448+
if (parent[u] == TSK_NULL && left_child[u] == TSK_NULL) {
449+
genotypes[j] = TSK_MISSING_DATA;
450+
num_missing++;
451+
}
452+
}
453+
return num_missing;
454+
}
455+
442456
static tsk_id_t
443457
tsk_variant_get_allele_index(tsk_variant_t *self, const char *allele, tsk_size_t length)
444458
{
@@ -502,6 +516,10 @@ tsk_variant_decode(
502516
update_genotypes = tsk_variant_update_genotypes_sample_list;
503517
if (by_traversal) {
504518
update_genotypes = tsk_variant_update_genotypes_traversal;
519+
/* When decoding a user-provided list of nodes (which may include
520+
* non-samples), mark isolated nodes as missing directly by checking
521+
* isolation status for each requested node. */
522+
mark_missing = tsk_variant_mark_missing_any;
505523
}
506524

507525
if (self->user_alleles) {

0 commit comments

Comments
 (0)