|
1 | | -from io import StringIO |
| 1 | +""" |
| 2 | +Tests for genotype imputation (forward and Baum-Welch algorithms). |
| 3 | +""" |
| 4 | +import io |
2 | 5 |
|
3 | 6 | import numpy as np |
4 | 7 | import pandas as pd |
5 | 8 |
|
6 | 9 | import tskit |
7 | 10 |
|
8 | 11 |
|
9 | | -""" |
10 | | -A tree sequence containing 3 diploid individuals with 5 sites and 5 mutations |
11 | | -(one per site). The first 2 individuals are used as reference panel, |
12 | | -the last one is the target individual. |
13 | | -""" |
| 12 | +# A tree sequence containing 3 diploid individuals with 5 sites and 5 mutations |
| 13 | +# (one per site). The first 2 individuals are used as reference panel, |
| 14 | +# the last one is the target individual. |
| 15 | + |
14 | 16 | _toy_ts_nodes_text = """\ |
15 | 17 | id is_sample time population individual metadata |
16 | 18 | 0 1 0.000000 0 0 |
@@ -81,41 +83,40 @@ def get_toy_ts(): |
81 | 83 | Returns the toy tree sequence in text format above. |
82 | 84 | """ |
83 | 85 | ts = tskit.load_text( |
84 | | - nodes=StringIO(_toy_ts_nodes_text), |
85 | | - edges=StringIO(_toy_ts_edges_text), |
86 | | - sites=StringIO(_toy_ts_sites_text), |
87 | | - mutations=StringIO(_toy_ts_mutations_text), |
88 | | - individuals=StringIO(_toy_ts_individuals_text), |
| 86 | + nodes=io.StringIO(_toy_ts_nodes_text), |
| 87 | + edges=io.StringIO(_toy_ts_edges_text), |
| 88 | + sites=io.StringIO(_toy_ts_sites_text), |
| 89 | + mutations=io.StringIO(_toy_ts_mutations_text), |
| 90 | + individuals=io.StringIO(_toy_ts_individuals_text), |
89 | 91 | strict=False, |
90 | 92 | ) |
91 | 93 | return ts |
92 | 94 |
|
93 | 95 |
|
94 | | -""" |
95 | | -BEAGLE 4.1 was run on the toy data set above using default parameters. |
96 | | -The following are the forward probability matrices and backward probability |
97 | | -matrices calculated when imputing into the third individual above. There are |
98 | | -two sets of matrices, one for each haplotype. |
99 | | -
|
100 | | -Notes about calculations: |
101 | | -n = number of haplotypes in ref. panel |
102 | | -M = number of markers |
103 | | -m = index of marker (site) |
104 | | -h = index of haplotype in ref. panel |
105 | | -
|
106 | | -In forward probability matrix, |
107 | | - fwd[m][h] = emission prob., if m = 0 (first marker) |
108 | | - fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise |
109 | | - where scale = (1 - switch prob.)/sum of fwd[m - 1], |
110 | | - and shift = switch prob./n. |
111 | | -
|
112 | | -In backward probability matrix, |
113 | | - bwd[m][h] = 1, if m = M - 1 (last marker) // DON'T SEE THIS IN BEAGLE |
114 | | - unadj. bwd[m][h] = emission prob. / n |
115 | | - bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise |
116 | | - where scale = (1 - switch prob.)/sum of unadj. bwd[m], |
117 | | - and shift = switch prob./n. |
118 | | -""" |
| 96 | +# BEAGLE 4.1 was run on the toy data set above using default parameters. |
| 97 | +# The following are the forward probability matrices and backward probability |
| 98 | +# matrices calculated when imputing into the third individual above. There are |
| 99 | +# two sets of matrices, one for each haplotype. |
| 100 | +# |
| 101 | +# Notes about calculations: |
| 102 | +# n = number of haplotypes in ref. panel |
| 103 | +# M = number of markers |
| 104 | +# m = index of marker (site) |
| 105 | +# h = index of haplotype in ref. panel |
| 106 | +# |
| 107 | +# In forward probability matrix, |
| 108 | +# fwd[m][h] = emission prob., if m = 0 (first marker) |
| 109 | +# fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise |
| 110 | +# where scale = (1 - switch prob.)/sum of fwd[m - 1], |
| 111 | +# and shift = switch prob./n. |
| 112 | +# |
| 113 | +# In backward probability matrix, |
| 114 | +# bwd[m][h] = 1, if m = M - 1 (last marker) // DON'T SEE THIS IN BEAGLE |
| 115 | +# unadj. bwd[m][h] = emission prob. / n |
| 116 | +# bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise |
| 117 | +# where scale = (1 - switch prob.)/sum of unadj. bwd[m], |
| 118 | +# and shift = switch prob./n. |
| 119 | + |
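
For reference, below is a minimal NumPy sketch of the forward recursion described in the notes above. It is illustrative only (not part of the test file or of BEAGLE itself); the inputs `emission` (an M x n array of emission probabilities) and `switch` (a length-M array of switch probabilities), and the function name, are hypothetical placeholders.

```python
import numpy as np


def forward_matrix_sketch(emission, switch):
    """Illustrative forward recursion following the notes above."""
    num_markers, num_haps = emission.shape
    fwd = np.zeros_like(emission)
    # First marker: the forward value is just the emission probability.
    fwd[0] = emission[0]
    for m in range(1, num_markers):
        shift = switch[m] / num_haps
        scale = (1 - switch[m]) / fwd[m - 1].sum()
        # fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift)
        fwd[m] = emission[m] * (scale * fwd[m - 1] + shift)
    return fwd
```

The backward matrix follows the analogous recursion given in the notes, iterating from the last marker back to the first.
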
119 | 120 | _fwd_matrix_text_1 = """ |
120 | 121 | m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val |
121 | 122 | 0,0,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,0.000100,0.000100 |
@@ -201,7 +202,7 @@ def convert_to_pd_df(matrix_text): |
201 | 202 | """ |
202 | 203 | Converts a matrix in text to a Pandas dataframe and returns it. |
203 | 204 | """ |
204 | | - df = pd.read_csv(StringIO(matrix_text)) |
| 205 | + df = pd.read_csv(io.StringIO(matrix_text)) |
205 | 206 | for i in np.arange(df.shape[0]): |
206 | 207 | # Check that switch and non-switch probabilities sum to 1 |
207 | 208 | assert df.probRec[i] + df.probNoRec[i] == 1 or np.isnan(df.probRec[i]) |
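
For context, the BEAGLE matrix text blocks above would be parsed with calls like the following (a sketch; `convert_to_pd_df` and `_fwd_matrix_text_1` are defined in this file, while the result name is hypothetical):

```python
# Parse the BEAGLE forward matrix for the first target haplotype
# into a dataframe with one row per (marker, reference haplotype) pair.
fwd_df_1 = convert_to_pd_df(_fwd_matrix_text_1)
```
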
|