Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 88 additions & 3 deletions c/tests/test_genotypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -512,10 +512,48 @@ test_single_tree_non_samples(void)

tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
/* It's an error to hand in non-samples without imputation turned on */
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
tsk_vargen_free(&vargen);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_vargen_print_state(&vargen, _devnull);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_FALSE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 0);
CU_ASSERT_EQUAL(var->site.mutations_length, 1);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_FALSE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->genotypes[0], 1);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 1);
CU_ASSERT_EQUAL(var->site.mutations_length, 2);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_FALSE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 2);
CU_ASSERT_EQUAL(var->site.mutations_length, 4);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_free(&vargen);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_ISOLATED_NOT_MISSING);
CU_ASSERT_EQUAL_FATAL(ret, 0);
Expand Down Expand Up @@ -559,6 +597,52 @@ test_single_tree_non_samples(void)
tsk_treeseq_free(&ts);
}

static void
test_internal_node_missing_data(void)
{
int ret = 0;
const char *nodes = "1 0 -1 -1\n"
"1 0 -1 -1\n"
"0 1 -1 -1\n"
"0 1 -1 -1\n";
const char *edges = "0 1 2 0\n"
"0 1 2 1\n"
"1 2 3 0\n"
"1 2 3 1\n";
const char *sites = "0.5 0\n"
"1.5 0\n";
tsk_treeseq_t ts;
tsk_vargen_t vargen;
tsk_variant_t *var;
tsk_id_t samples[] = { 2, 3 };

tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, NULL, NULL, NULL, 0);

ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_TRUE(var->has_missing_data);
CU_ASSERT_EQUAL(var->site.id, 0);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_TRUE(var->has_missing_data);
CU_ASSERT_EQUAL(var->site.id, 1);
CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA);
CU_ASSERT_EQUAL(var->genotypes[1], 0);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_free(&vargen);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_treeseq_free(&ts);
}

static void
test_single_tree_errors(void)
{
Expand Down Expand Up @@ -1123,6 +1207,7 @@ main(int argc, char **argv)
{ "test_single_tree_char_alphabet", test_single_tree_char_alphabet },
{ "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet },
{ "test_single_tree_non_samples", test_single_tree_non_samples },
{ "test_internal_node_missing_data", test_internal_node_missing_data },
{ "test_single_tree_errors", test_single_tree_errors },
{ "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors },
{ "test_single_tree_subsample", test_single_tree_subsample },
Expand Down
4 changes: 0 additions & 4 deletions c/tskit/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -569,10 +569,6 @@ tsk_strerror_internal(int err)
break;

/* Genotype decoding errors */
case TSK_ERR_MUST_IMPUTE_NON_SAMPLES:
ret = "Cannot generate genotypes for non-samples when isolated nodes are "
"considered as missing. (TSK_ERR_MUST_IMPUTE_NON_SAMPLES)";
break;
case TSK_ERR_ALLELE_NOT_FOUND:
ret = "An allele was not found in the user-specified allele map. "
"(TSK_ERR_ALLELE_NOT_FOUND)";
Expand Down
6 changes: 0 additions & 6 deletions c/tskit/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -785,12 +785,6 @@ than 0.
@{
*/
/**
Genotypes were requested for non-samples at the same time
as asking that isolated nodes be marked as missing. This is not
supported.
*/
#define TSK_ERR_MUST_IMPUTE_NON_SAMPLES -1100
/**
A user-specified allele map was used, but didn't contain an allele
found in the tree sequence.
*/
Expand Down
95 changes: 79 additions & 16 deletions c/tskit/genotypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles)
static int
variant_init_samples_and_index_map(tsk_variant_t *self,
const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples,
size_t num_samples_alloc, tsk_flags_t options)
size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options))
{
int ret = 0;
const tsk_flags_t *flags = tree_sequence->tables->nodes.flags;
tsk_size_t j, num_nodes;
bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING);
tsk_id_t u;

num_nodes = tsk_treeseq_get_num_nodes(tree_sequence);
Expand All @@ -120,11 +118,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self,
ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE);
goto out;
}
/* We can only detect missing data for samples */
if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) {
ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
goto out;
}
self->alt_sample_index_map[samples[j]] = (tsk_id_t) j;
}
out:
Expand All @@ -141,6 +134,7 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence,
tsk_size_t num_samples_alloc;

tsk_memset(self, 0, sizeof(tsk_variant_t));
self->missingmess_cache_tree_index = TSK_NULL;

/* Set site id to NULL to indicate the variant is not decoded */
self->site.id = TSK_NULL;
Expand Down Expand Up @@ -218,12 +212,14 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence,
}
self->sample_index_map = self->alt_sample_index_map;
}
/* When a list of samples is given, we use the traversal based algorithm
* which doesn't use sample list tracking in the tree */
/* When a list of samples is given, we use the traversal based algorithm,
* so we need traversal scratch space and presence tracking. */
if (self->alt_samples != NULL) {
self->traversal_stack = tsk_malloc(
tsk_treeseq_get_num_nodes(tree_sequence) * sizeof(*self->traversal_stack));
if (self->traversal_stack == NULL) {
self->sample_is_present
= tsk_malloc(num_samples_alloc * sizeof(*self->sample_is_present));
if (self->traversal_stack == NULL || self->sample_is_present == NULL) {
ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
goto out;
}
Expand Down Expand Up @@ -274,6 +270,7 @@ tsk_variant_free(tsk_variant_t *self)
tsk_safe_free(self->samples);
tsk_safe_free(self->alt_samples);
tsk_safe_free(self->alt_sample_index_map);
tsk_safe_free(self->sample_is_present);
tsk_safe_free(self->traversal_stack);
return 0;
}
Expand Down Expand Up @@ -416,6 +413,55 @@ tsk_variant_update_genotypes_traversal(
return tsk_variant_traverse(self, node, derived, tsk_variant_visit);
}

static void
tsk_variant_update_missing_cache(tsk_variant_t *self)
{
const tsk_id_t *restrict left_child = self->tree.left_child;
const tsk_id_t *restrict right_sib = self->tree.right_sib;
const tsk_id_t *restrict sample_index_map = self->sample_index_map;
const tsk_id_t N = self->tree.virtual_root;
const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags;
bool *restrict present = self->sample_is_present;
tsk_id_t *restrict stack = self->traversal_stack;
tsk_id_t root, u, v, sample_index;
tsk_size_t j;
int stack_top;

tsk_bug_assert(self->alt_samples != NULL);
tsk_bug_assert(self->tree.index != TSK_NULL);

tsk_memset(present, 0, self->num_samples * sizeof(*present));

stack_top = -1;
for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
stack_top++;
stack[stack_top] = root;
}

while (stack_top >= 0) {
u = stack[stack_top];
stack_top--;
sample_index = sample_index_map[u];
if (sample_index != TSK_NULL) {
present[sample_index] = true;
}
for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
stack_top++;
stack[stack_top] = v;
}
}

for (j = 0; j < self->num_samples; j++) {
u = self->samples[j];
present[j]
= present[j]
&& !((flags[u] & TSK_NODE_IS_SAMPLE) != 0
&& self->tree.parent[u] == TSK_NULL && left_child[u] == TSK_NULL);
}

self->missingmess_cache_tree_index = self->tree.index;
}

static tsk_size_t
tsk_variant_mark_missing(tsk_variant_t *self)
{
Expand All @@ -426,16 +472,27 @@ tsk_variant_mark_missing(tsk_variant_t *self)
const tsk_id_t N = self->tree.virtual_root;
int32_t *restrict genotypes = self->genotypes;
tsk_id_t root, sample_index;
tsk_size_t j;

for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
if (left_child[root] == TSK_NULL) {
sample_index = sample_index_map[root];
if (sample_index != TSK_NULL) {
genotypes[sample_index] = TSK_MISSING_DATA;
if (self->alt_samples == NULL) {
for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
if (left_child[root] == TSK_NULL) {
sample_index = sample_index_map[root];
if (sample_index != TSK_NULL) {
genotypes[sample_index] = TSK_MISSING_DATA;
num_missing++;
}
}
}
} else {
for (j = 0; j < self->num_samples; j++) {
if (!self->sample_is_present[j]) {
genotypes[j] = TSK_MISSING_DATA;
num_missing++;
}
}
}

return num_missing;
}

Expand Down Expand Up @@ -536,6 +593,10 @@ tsk_variant_decode(
* mutations directly over samples should not be missing */
num_missing = 0;
if (!impute_missing) {
if (self->alt_samples != NULL
&& self->missingmess_cache_tree_index != self->tree.index) {
tsk_variant_update_missing_cache(self);
}
num_missing = mark_missing(self);
}
for (j = 0; j < self->site.mutations_length; j++) {
Expand Down Expand Up @@ -591,7 +652,9 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other)
other->sample_index_map = NULL;
other->alt_samples = NULL;
other->alt_sample_index_map = NULL;
other->sample_is_present = NULL;
other->user_alleles_mem = NULL;
other->missingmess_cache_tree_index = TSK_NULL;

total_len = 0;
for (j = 0; j < self->num_alleles; j++) {
Expand Down
2 changes: 2 additions & 0 deletions c/tskit/genotypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,14 @@ typedef struct {
tsk_id_t *samples;

const tsk_id_t *sample_index_map;
bool *sample_is_present;
bool user_alleles;
char *user_alleles_mem;
tsk_id_t *traversal_stack;
tsk_flags_t options;
tsk_id_t *alt_samples;
tsk_id_t *alt_sample_index_map;
tsk_id_t missingmess_cache_tree_index;

} tsk_variant_t;

Expand Down
21 changes: 19 additions & 2 deletions python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1283,8 +1283,8 @@ def test_empty_samples(self):

def test_non_sample_samples(self):
ts = self.ts()
with pytest.raises(tskit.LibraryError, match="MUST_IMPUTE_NON_SAMPLES"):
list(ts.alignments(samples=[4]))
alignment = list(ts.alignments(samples=[4]))
assert alignment == ["NNANNNNNNT"]

def test_alignments_missing_data_char(self):
A = list(self.ts().alignments(missing_data_character="x"))
Expand All @@ -1308,6 +1308,23 @@ def test_alignments_reference_sequence_embedded_null(self):
assert A[1] == "01A3\x005678C"
assert A[2] == "01A3\x005678C"

def test_internal_node_missing_alignments(self):
tables = tskit.TableCollection(sequence_length=2)
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
tables.nodes.add_row(time=1)
tables.nodes.add_row(time=1)
tables.edges.add_row(left=0, right=1, parent=2, child=0)
tables.edges.add_row(left=0, right=1, parent=2, child=1)
tables.edges.add_row(left=1, right=2, parent=3, child=0)
tables.edges.add_row(left=1, right=2, parent=3, child=1)
tables.sites.add_row(position=0, ancestral_state="A")
tables.sites.add_row(position=1, ancestral_state="C")
ts = tables.tree_sequence()
alignments = list(ts.alignments(samples=[2, 3]))
assert alignments[0] == "AN"
assert alignments[1] == "NC"

def test_fasta_default(self):
expected = textwrap.dedent(
"""\
Expand Down