|
1 | | -from io import StringIO |
| 1 | +""" |
| 2 | +Tests for genotype imputation (forward and Baum-Welch algorithms). |
| 3 | +""" |
| 4 | +import io |
2 | 5 |
|
3 | 6 | import numpy as np |
4 | 7 | import pandas as pd |
5 | 8 |
|
6 | 9 | import tskit |
7 | 10 |
|
8 | 11 |
|
9 | | -""" |
10 | | -A tree sequence containing 3 diploid individuals with 5 sites and 5 mutations |
11 | | -(one per site). The first 2 individuals are used as reference panel, |
12 | | -the last one is the target individual. |
13 | | -""" |
| 12 | +# A tree sequence containing 3 diploid individuals with 5 sites and 5 mutations |
| 13 | +# (one per site). The first 2 individuals are used as reference panel, |
| 14 | +# the last one is the target individual. |
| 15 | + |
14 | 16 | _toy_ts_nodes_text = """\ |
15 | 17 | id is_sample time population individual metadata |
16 | 18 | 0 1 0.000000 0 0 |
@@ -81,41 +83,40 @@ def get_toy_ts(): |
81 | 83 | Returns the toy tree sequence in text format above. |
82 | 84 | """ |
83 | 85 | ts = tskit.load_text( |
84 | | - nodes=StringIO(_toy_ts_nodes_text), |
85 | | - edges=StringIO(_toy_ts_edges_text), |
86 | | - sites=StringIO(_toy_ts_sites_text), |
87 | | - mutations=StringIO(_toy_ts_mutations_text), |
88 | | - individuals=StringIO(_toy_ts_individuals_text), |
| 86 | + nodes=io.StringIO(_toy_ts_nodes_text), |
| 87 | + edges=io.StringIO(_toy_ts_edges_text), |
| 88 | + sites=io.StringIO(_toy_ts_sites_text), |
| 89 | + mutations=io.StringIO(_toy_ts_mutations_text), |
| 90 | + individuals=io.StringIO(_toy_ts_individuals_text), |
89 | 91 | strict=False, |
90 | 92 | ) |
91 | 93 | return ts |
92 | 94 |
|
93 | 95 |
|
94 | | -""" |
95 | | -BEAGLE 4.1 was run on the toy data set above using default parameters. |
96 | | -The following are the forward probability matrices and backward probability |
97 | | -matrices calculated when imputing into the third individual above. There are |
98 | | -two sets of matrices, one for each haplotype. |
99 | | -
|
100 | | -Notes about calculations: |
101 | | -n = number of haplotypes in ref. panel |
102 | | -M = number of markers |
103 | | -m = index of marker (site) |
104 | | -h = index of haplotype in ref. panel |
105 | | -
|
106 | | -In forward probability matrix, |
107 | | - fwd[m][h] = emission prob., if m = 0 (first marker) |
108 | | - fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise |
109 | | - where scale = (1 - switch prob.)/sum of fwd[m - 1], |
110 | | - and shift = switch prob./n. |
111 | | -
|
112 | | -In backward probability matrix, |
113 | | - bwd[m][h] = 1, if m = M - 1 (last marker) // DON'T SEE THIS IN BEAGLE |
114 | | - unadj. bwd[m][h] = emission prob. / n |
115 | | - bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise |
116 | | - where scale = (1 - switch prob.)/sum of unadj. bwd[m], |
117 | | - and shift = switch prob./n. |
118 | | -""" |
| 96 | +# BEAGLE 4.1 was run on the toy data set above using default parameters. |
| 97 | +# The following are the forward probability matrices and backward probability |
| 98 | +# matrices calculated when imputing into the third individual above. There are |
| 99 | +# two sets of matrices, one for each haplotype. |
| 100 | +# |
| 101 | +# Notes about calculations: |
| 102 | +# n = number of haplotypes in ref. panel |
| 103 | +# M = number of markers |
| 104 | +# m = index of marker (site) |
| 105 | +# h = index of haplotype in ref. panel |
| 106 | +# |
| 107 | +# In forward probability matrix, |
| 108 | +# fwd[m][h] = emission prob., if m = 0 (first marker) |
| 109 | +# fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise |
| 110 | +# where scale = (1 - switch prob.)/sum of fwd[m - 1], |
| 111 | +# and shift = switch prob./n. |
| 112 | +# |
| 113 | +# In backward probability matrix, |
| 114 | +# bwd[m][h] = 1, if m = M - 1 (last marker) // DON'T SEE THIS IN BEAGLE |
| 115 | +# unadj. bwd[m][h] = emission prob. / n |
| 116 | +# bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise |
| 117 | +# where scale = (1 - switch prob.)/sum of unadj. bwd[m], |
| 118 | +# and shift = switch prob./n. |
| 119 | + |
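
For reference, below is a minimal NumPy sketch of the forward recursion described in the notes above. It is illustrative only (not part of the test file or of BEAGLE itself); the inputs `emission` (an M x n array of emission probabilities) and `switch` (a length-M array of switch probabilities), and the function name, are hypothetical placeholders.

```python
import numpy as np


def forward_matrix_sketch(emission, switch):
    """Illustrative forward recursion following the notes above."""
    num_markers, num_haps = emission.shape
    fwd = np.zeros_like(emission)
    # First marker: the forward value is just the emission probability.
    fwd[0] = emission[0]
    for m in range(1, num_markers):
        shift = switch[m] / num_haps
        scale = (1 - switch[m]) / fwd[m - 1].sum()
        # fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift)
        fwd[m] = emission[m] * (scale * fwd[m - 1] + shift)
    return fwd
```

The backward matrix follows the analogous recursion given in the notes, iterating from the last marker back to the first.
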
119 | 120 | _fwd_matrix_text_1 = """ |
120 | 121 | m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val |
121 | 122 | 0,0,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,0.000100,0.000100 |
@@ -201,7 +202,7 @@ def convert_to_pd_df(matrix_text): |
201 | 202 | """ |
202 | 203 | Converts a matrix in text to a Pandas dataframe and returns it. |
203 | 204 | """ |
204 | | - df = pd.read_csv(StringIO(matrix_text)) |
| 205 | + df = pd.read_csv(io.StringIO(matrix_text)) |
205 | 206 | for i in np.arange(df.shape[0]): |
206 | 207 | # Check that switch and non-switch probabilities sum to 1 |
207 | 208 | assert df.probRec[i] + df.probNoRec[i] == 1 or np.isnan(df.probRec[i]) |
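
For context, the BEAGLE matrix text blocks above would be parsed with calls like the following (a sketch; `convert_to_pd_df` and `_fwd_matrix_text_1` are defined in this file, while the result name is hypothetical):

```python
# Parse the BEAGLE forward matrix for the first target haplotype
# into a dataframe with one row per (marker, reference haplotype) pair.
fwd_df_1 = convert_to_pd_df(_fwd_matrix_text_1)
```
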
|