Skip to content
This repository was archived by the owner on Oct 21, 2025. It is now read-only.

Commit 3689ea8

Browse files
Merge pull request #147 from theislab/dev
Dev
2 parents 00f8a83 + 6edaa1b commit 3689ea8

File tree

7 files changed

+228
-11
lines changed

7 files changed

+228
-11
lines changed

diffxpy/stats/stats.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import numpy.linalg
33
import scipy.sparse
44
import scipy.stats
5+
import sys
56
from typing import Union
67

78

@@ -263,16 +264,20 @@ def wald_test_chisq(
263264
raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries')
264265

265266
theta_diff = theta_mle - theta0
266-
wald_statistic = np.array([
267+
invertible = np.where(np.linalg.cond(theta_covar, p=None) < 1 / sys.float_info.epsilon)[0]
268+
wald_statistic = np.zeros([theta_covar.shape[0]]) + np.nan
269+
wald_statistic[invertible] = np.array([
267270
np.matmul(
268271
np.matmul(
269272
theta_diff[:, [i]].T,
270273
numpy.linalg.inv(theta_covar[i, :, :])
271274
),
272275
theta_diff[:, [i]]
273276
)
274-
for i in range(theta_diff.shape[1])
277+
for i in invertible
275278
]).flatten()
279+
if invertible.shape[0] < theta_covar.shape[0]:
280+
print("caught %i linalg singular matrix errors" % (theta_covar.shape[0] - invertible.shape[0]))
276281
pvals = 1 - scipy.stats.chi2(theta_mle.shape[0]).cdf(wald_statistic)
277282
return pvals
278283

diffxpy/testing/tests.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,7 +1026,7 @@ def two_sample(
10261026
if noise_model is None:
10271027
raise ValueError("Please specify noise_model")
10281028
formula_loc = '~ 1 + grouping'
1029-
formula_scale = '~ 1 + grouping'
1029+
formula_scale = '~ 1'
10301030
de_test = wald(
10311031
data=data,
10321032
factor_loc_totest="grouping",
@@ -1038,8 +1038,6 @@ def two_sample(
10381038
sample_description=sample_description,
10391039
noise_model=noise_model,
10401040
size_factors=size_factors,
1041-
init_a="closed_form",
1042-
init_b="closed_form",
10431041
batch_size=batch_size,
10441042
backend=backend,
10451043
train_args=train_args,

diffxpy/testing/utils.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,3 +271,49 @@ def constraint_system_from_star(
271271
constraints=constraints,
272272
return_type=return_type
273273
)
274+
275+
276+
def bin_continuous_covariate(
277+
factor_to_bin: str,
278+
bins: Union[int, list, np.ndarray, Tuple],
279+
data: Union[None, anndata.AnnData] = None,
280+
sample_description: Union[None, pd.DataFrame] = None
281+
):
282+
r"""
283+
Bin a continuous covariate.
284+
285+
Adds the binned covariate to the table. If data is supplied, the covariate is added in place in data.obs, otherwise
286+
the covariate is added in the sample_description and the new sample_description is returned.
287+
Binning is performed on quantiles of the distribution.
288+
289+
:param factor_to_bin: Name of columns of factor to bin.
290+
:param bins: Number of bins or iteratable with bin borders. If given as integer, the bins are defined on the
291+
quantiles of the covariate, ie the bottom 20% of observations are in the first bin if bins==5.
292+
:param data: Anndata object that contains sample description table in .obs.
293+
:param sample_description: Sample description table.
294+
:return: Sample description table with binned covariate added if sample_description was supplied, otherwise None is
295+
returned as the new column was added in place.
296+
"""
297+
if data is None and sample_description is not None:
298+
sd = sample_description
299+
elif data is not None and sample_description is None:
300+
sd = data.obs
301+
else:
302+
raise ValueError("supply either data or sample_description")
303+
if isinstance(bins, list) or isinstance(bins, np.ndarray) or isinstance(bins, Tuple):
304+
bins = np.asarray(bins)
305+
else:
306+
bins = np.arange(0, 1, 1 / bins)
307+
308+
fac_binned = glm.data.bin_continuous_covariate(
309+
sample_description=sd,
310+
factor_to_bin=factor_to_bin,
311+
bins=bins
312+
)
313+
if data is None and sample_description is not None:
314+
sd[factor_to_bin + "_binned"] = fac_binned
315+
return sample_description
316+
elif data is not None and sample_description is None:
317+
data.obs[factor_to_bin + "_binned"] = fac_binned
318+
else:
319+
assert False
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
import logging
2+
import unittest
3+
import numpy as np
4+
import pandas as pd
5+
import scipy.stats as stats
6+
7+
import diffxpy.api as de
8+
9+
10+
class TestTwosample(unittest.TestCase):
11+
12+
def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
13+
"""
14+
Test if de.test_wald_loc() generates a uniform p-value distribution
15+
if it is given data simulated based on the null model. Returns the p-value
16+
of the two-side Kolmgorov-Smirnov test for equality of the observed
17+
p-value distriubution and a uniform distribution.
18+
19+
:param n_cells: Number of cells to simulate (number of observations per test).
20+
:param n_genes: Number of genes to simulate (number of tests).
21+
"""
22+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
23+
logging.getLogger("batchglm").setLevel(logging.WARNING)
24+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
25+
from batchglm.api.models.numpy.glm_nb import Simulator
26+
27+
sim = Simulator(num_observations=n_cells, num_features=n_genes)
28+
sim.generate_sample_description(num_batches=0, num_conditions=0)
29+
sim.generate()
30+
31+
random_sample_description = pd.DataFrame({
32+
"condition": np.random.randint(n_groups, size=sim.nobs)
33+
})
34+
35+
test = de.test.two_sample(
36+
data=sim.input_data,
37+
grouping=random_sample_description["condition"].values,
38+
test="wald",
39+
noise_model="nb",
40+
)
41+
summary = test.summary()
42+
43+
# Compare p-value distribution under null model against uniform distribution.
44+
pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue
45+
46+
logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
47+
assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)
48+
49+
return True
50+
51+
def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
52+
"""
53+
Test if de.test_wald_loc() generates a uniform p-value distribution
54+
if it is given data simulated based on the null model. Returns the p-value
55+
of the two-side Kolmgorov-Smirnov test for equality of the observed
56+
p-value distriubution and a uniform distribution.
57+
58+
:param n_cells: Number of cells to simulate (number of observations per test).
59+
:param n_genes: Number of genes to simulate (number of tests).
60+
"""
61+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
62+
logging.getLogger("batchglm").setLevel(logging.WARNING)
63+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
64+
from batchglm.api.models.numpy.glm_nb import Simulator
65+
66+
sim = Simulator(num_observations=n_cells, num_features=n_genes)
67+
sim.generate_sample_description(num_batches=0, num_conditions=0)
68+
sim.generate()
69+
70+
random_sample_description = pd.DataFrame({
71+
"condition": np.random.randint(n_groups, size=sim.nobs)
72+
})
73+
74+
test = de.test.two_sample(
75+
data=sim.input_data,
76+
grouping=random_sample_description["condition"],
77+
test="wald",
78+
noise_model="nb",
79+
)
80+
summary = test.summary()
81+
82+
# Compare p-value distribution under null model against uniform distribution.
83+
pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue
84+
85+
logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
86+
assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)
87+
88+
return True
89+
90+
def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
91+
"""
92+
Test if de.test_wald_loc() generates a uniform p-value distribution
93+
if it is given data simulated based on the null model. Returns the p-value
94+
of the two-side Kolmgorov-Smirnov test for equality of the observed
95+
p-value distriubution and a uniform distribution.
96+
97+
:param n_cells: Number of cells to simulate (number of observations per test).
98+
:param n_genes: Number of genes to simulate (number of tests).
99+
"""
100+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
101+
logging.getLogger("batchglm").setLevel(logging.WARNING)
102+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
103+
from batchglm.api.models.numpy.glm_nb import Simulator
104+
105+
sim = Simulator(num_observations=n_cells, num_features=n_genes)
106+
sim.generate_sample_description(num_batches=0, num_conditions=0)
107+
sim.generate()
108+
109+
random_sample_description = pd.DataFrame({
110+
"condition": np.random.randint(n_groups, size=sim.nobs)
111+
})
112+
113+
test = de.test.two_sample(
114+
data=sim.input_data,
115+
grouping=random_sample_description["condition"],
116+
test="rank"
117+
)
118+
summary = test.summary()
119+
120+
# Compare p-value distribution under null model against uniform distribution.
121+
pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue
122+
123+
logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
124+
assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)
125+
126+
return True
127+
128+
def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
129+
"""
130+
Test if de.test_wald_loc() generates a uniform p-value distribution
131+
if it is given data simulated based on the null model. Returns the p-value
132+
of the two-side Kolmgorov-Smirnov test for equality of the observed
133+
p-value distriubution and a uniform distribution.
134+
135+
:param n_cells: Number of cells to simulate (number of observations per test).
136+
:param n_genes: Number of genes to simulate (number of tests).
137+
"""
138+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
139+
logging.getLogger("batchglm").setLevel(logging.WARNING)
140+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
141+
from batchglm.api.models.numpy.glm_nb import Simulator
142+
143+
sim = Simulator(num_observations=n_cells, num_features=n_genes)
144+
sim.generate_sample_description(num_batches=0, num_conditions=0)
145+
sim.generate()
146+
147+
random_sample_description = pd.DataFrame({
148+
"condition": np.random.randint(n_groups, size=sim.nobs)
149+
})
150+
151+
test = de.test.two_sample(
152+
data=sim.input_data,
153+
grouping=random_sample_description["condition"],
154+
test="t_test"
155+
)
156+
summary = test.summary()
157+
158+
# Compare p-value distribution under null model against uniform distribution.
159+
pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue
160+
161+
logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
162+
assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)
163+
164+
return True
165+
166+
167+
if __name__ == '__main__':
168+
unittest.main()

diffxpy/unit_test/test_vsrest.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n
3333
})
3434

3535
test = de.test.versus_rest(
36-
data=sim.x,
36+
data=sim.input_data,
3737
grouping="condition",
3838
test="wald",
3939
noise_model="nb",
@@ -76,7 +76,7 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100):
7676
})
7777

7878
test = de.test.versus_rest(
79-
data=sim.x,
79+
data=sim.input_data,
8080
grouping="condition",
8181
test="lrt",
8282
noise_model="nb",
@@ -119,7 +119,7 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n
119119
})
120120

121121
test = de.test.versus_rest(
122-
data=sim.x,
122+
data=sim.input_data,
123123
grouping="condition",
124124
test="rank",
125125
sample_description=random_sample_description,
@@ -159,7 +159,7 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100,
159159
})
160160

161161
test = de.test.versus_rest(
162-
data=sim.x,
162+
data=sim.input_data,
163163
grouping="condition",
164164
test="t-test",
165165
sample_description=random_sample_description,

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ numpy>=1.14.0
22
scipy
33
pandas
44
patsy>=0.5.0
5-
batchglm>=0.7.3
5+
batchglm>=0.7.4
66
statsmodels
77
anndata
88
seaborn

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
'scipy>=1.2.1',
2222
'pandas',
2323
'patsy>=0.5.0',
24-
'batchglm>=0.7.3',
24+
'batchglm>=0.7.4',
2525
'statsmodels',
2626
],
2727
extras_require={

0 commit comments

Comments
 (0)