Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 88 additions & 3 deletions c/tests/test_genotypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -512,10 +512,48 @@ test_single_tree_non_samples(void)

tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
/* It's an error to hand in non-samples without imputation turned on */
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
tsk_vargen_free(&vargen);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_vargen_print_state(&vargen, _devnull);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_FALSE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 0);
CU_ASSERT_EQUAL(var->site.mutations_length, 1);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_FALSE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->genotypes[0], 1);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 1);
CU_ASSERT_EQUAL(var->site.mutations_length, 2);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_FALSE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 2);
CU_ASSERT_EQUAL(var->site.mutations_length, 4);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_free(&vargen);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_ISOLATED_NOT_MISSING);
CU_ASSERT_EQUAL_FATAL(ret, 0);
Expand Down Expand Up @@ -559,6 +597,52 @@ test_single_tree_non_samples(void)
tsk_treeseq_free(&ts);
}

static void
test_internal_node_missing_data(void)
{
int ret = 0;
const char *nodes = "1 0 -1 -1\n"
"1 0 -1 -1\n"
"0 1 -1 -1\n"
"0 1 -1 -1\n";
const char *edges = "0 1 2 0\n"
"0 1 2 1\n"
"1 2 3 0\n"
"1 2 3 1\n";
const char *sites = "0.5 0\n"
"1.5 0\n";
tsk_treeseq_t ts;
tsk_vargen_t vargen;
tsk_variant_t *var;
tsk_id_t samples[] = { 2, 3 };

tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, NULL, NULL, NULL, 0);

ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_TRUE(var->has_missing_data);
CU_ASSERT_EQUAL(var->site.id, 0);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_TRUE(var->has_missing_data);
CU_ASSERT_EQUAL(var->site.id, 1);
CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA);
CU_ASSERT_EQUAL(var->genotypes[1], 0);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_free(&vargen);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_treeseq_free(&ts);
}

static void
test_single_tree_errors(void)
{
Expand Down Expand Up @@ -1123,6 +1207,7 @@ main(int argc, char **argv)
{ "test_single_tree_char_alphabet", test_single_tree_char_alphabet },
{ "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet },
{ "test_single_tree_non_samples", test_single_tree_non_samples },
{ "test_internal_node_missing_data", test_internal_node_missing_data },
{ "test_single_tree_errors", test_single_tree_errors },
{ "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors },
{ "test_single_tree_subsample", test_single_tree_subsample },
Expand Down
4 changes: 0 additions & 4 deletions c/tskit/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -569,10 +569,6 @@ tsk_strerror_internal(int err)
break;

/* Genotype decoding errors */
case TSK_ERR_MUST_IMPUTE_NON_SAMPLES:
ret = "Cannot generate genotypes for non-samples when isolated nodes are "
"considered as missing. (TSK_ERR_MUST_IMPUTE_NON_SAMPLES)";
break;
case TSK_ERR_ALLELE_NOT_FOUND:
ret = "An allele was not found in the user-specified allele map. "
"(TSK_ERR_ALLELE_NOT_FOUND)";
Expand Down
6 changes: 0 additions & 6 deletions c/tskit/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -785,12 +785,6 @@ than 0.
@{
*/
/**
Genotypes were requested for non-samples at the same time
as asking that isolated nodes be marked as missing. This is not
supported.
*/
#define TSK_ERR_MUST_IMPUTE_NON_SAMPLES -1100
/**
A user-specified allele map was used, but didn't contain an allele
found in the tree sequence.
*/
Expand Down
66 changes: 51 additions & 15 deletions c/tskit/genotypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles)
static int
variant_init_samples_and_index_map(tsk_variant_t *self,
const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples,
size_t num_samples_alloc, tsk_flags_t options)
size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options))
{
int ret = 0;
const tsk_flags_t *flags = tree_sequence->tables->nodes.flags;
tsk_size_t j, num_nodes;
bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING);
tsk_id_t u;

num_nodes = tsk_treeseq_get_num_nodes(tree_sequence);
Expand All @@ -120,11 +118,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self,
ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE);
goto out;
}
/* We can only detect missing data for samples */
if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) {
ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
goto out;
}
self->alt_sample_index_map[samples[j]] = (tsk_id_t) j;
}
out:
Expand Down Expand Up @@ -218,15 +211,21 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence,
}
self->sample_index_map = self->alt_sample_index_map;
}
/* When a list of samples is given, we use the traversal based algorithm
* which doesn't use sample list tracking in the tree */
/* When a list of samples is given, we use the traversal based algorithm,
* so we need traversal scratch space and presence tracking. */
if (self->alt_samples != NULL) {
self->traversal_stack = tsk_malloc(
tsk_treeseq_get_num_nodes(tree_sequence) * sizeof(*self->traversal_stack));
if (self->traversal_stack == NULL) {
ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
goto out;
}
self->sample_is_present
= tsk_malloc(num_samples_alloc * sizeof(*self->sample_is_present));
if (self->sample_is_present == NULL) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

usual practise is to bundle multiple mem error checks together

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
goto out;
}
}

self->genotypes = tsk_malloc(num_samples_alloc * sizeof(*self->genotypes));
Expand Down Expand Up @@ -274,6 +273,7 @@ tsk_variant_free(tsk_variant_t *self)
tsk_safe_free(self->samples);
tsk_safe_free(self->alt_samples);
tsk_safe_free(self->alt_sample_index_map);
tsk_safe_free(self->sample_is_present);
tsk_safe_free(self->traversal_stack);
return 0;
}
Expand Down Expand Up @@ -425,13 +425,48 @@ tsk_variant_mark_missing(tsk_variant_t *self)
const tsk_id_t *restrict sample_index_map = self->sample_index_map;
const tsk_id_t N = self->tree.virtual_root;
int32_t *restrict genotypes = self->genotypes;
tsk_id_t root, sample_index;
tsk_id_t root, sample_index, u, v;
const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags;
bool *restrict present = self->sample_is_present;
tsk_id_t *restrict stack = self->traversal_stack;
int stack_top = -1;
tsk_size_t j = 0;

if (self->alt_samples == NULL) {
for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
if (left_child[root] == TSK_NULL) {
sample_index = sample_index_map[root];
if (sample_index != TSK_NULL) {
genotypes[sample_index] = TSK_MISSING_DATA;
num_missing++;
}
}
}
} else {
tsk_memset(present, 0, self->num_samples * sizeof(*present));

for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
stack[++stack_top] = root;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't use return value of increment operators, separate into two statements. Please use the existing patterns for these kinds of traversals so I don't have to think about it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

}

for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) {
if (left_child[root] == TSK_NULL) {
sample_index = sample_index_map[root];
while (stack_top >= 0) {
u = stack[stack_top--];
sample_index = sample_index_map[u];
if (sample_index != TSK_NULL) {
genotypes[sample_index] = TSK_MISSING_DATA;
present[sample_index] = true;
}
for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
stack[++stack_top] = v;
}
}

for (j = 0; j < self->num_samples; j++) {
u = self->samples[j];
if (!present[j]
|| ((flags[u] & TSK_NODE_IS_SAMPLE) != 0
&& self->tree.parent[u] == TSK_NULL
&& left_child[u] == TSK_NULL)) {
genotypes[j] = TSK_MISSING_DATA;
num_missing++;
}
}
Expand Down Expand Up @@ -591,6 +626,7 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other)
other->sample_index_map = NULL;
other->alt_samples = NULL;
other->alt_sample_index_map = NULL;
other->sample_is_present = NULL;
other->user_alleles_mem = NULL;

total_len = 0;
Expand Down
1 change: 1 addition & 0 deletions c/tskit/genotypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ typedef struct {
tsk_id_t *samples;

const tsk_id_t *sample_index_map;
bool *sample_is_present;
bool user_alleles;
char *user_alleles_mem;
tsk_id_t *traversal_stack;
Expand Down
21 changes: 19 additions & 2 deletions python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1283,8 +1283,8 @@ def test_empty_samples(self):

def test_non_sample_samples(self):
ts = self.ts()
with pytest.raises(tskit.LibraryError, match="MUST_IMPUTE_NON_SAMPLES"):
list(ts.alignments(samples=[4]))
alignment = list(ts.alignments(samples=[4]))
assert alignment == ["NNANNNNNNT"]

def test_alignments_missing_data_char(self):
A = list(self.ts().alignments(missing_data_character="x"))
Expand All @@ -1308,6 +1308,23 @@ def test_alignments_reference_sequence_embedded_null(self):
assert A[1] == "01A3\x005678C"
assert A[2] == "01A3\x005678C"

def test_internal_node_missing_alignments(self):
tables = tskit.TableCollection(sequence_length=2)
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
tables.nodes.add_row(time=1)
tables.nodes.add_row(time=1)
tables.edges.add_row(left=0, right=1, parent=2, child=0)
tables.edges.add_row(left=0, right=1, parent=2, child=1)
tables.edges.add_row(left=1, right=2, parent=3, child=0)
tables.edges.add_row(left=1, right=2, parent=3, child=1)
tables.sites.add_row(position=0, ancestral_state="A")
tables.sites.add_row(position=1, ancestral_state="C")
ts = tables.tree_sequence()
alignments = list(ts.alignments(samples=[2, 3]))
assert alignments[0] == "AN"
assert alignments[1] == "NC"

def test_fasta_default(self):
expected = textwrap.dedent(
"""\
Expand Down