---
jupytext:
  text_representation:
    extension: .md
    format_name: myst
    format_version: 0.13
    jupytext_version: 1.17.1
kernelspec:
  display_name: Python 3 (ipykernel)
  language: python
  name: python3
---

```{code-cell} ipython3
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import brentq, minimize_scalar
from scipy.stats import beta as beta_dist
import pandas as pd
from IPython.display import display, Math
import quantecon as qe
```

## Likelihood Ratio Process

**Remark:** Think about how laws of large numbers are applied to compute error probabilities for the model selection problem and the classification problem.

## Special case: Markov chain models

Consider two $n$-state irreducible and aperiodic Markov chain models on the same state space $\{1, 2, \ldots, n\}$, with strictly positive transition matrices $P^{(f)}$, $P^{(g)}$ and initial distributions $\pi_0^{(f)}$, $\pi_0^{(g)}$.

Throughout this section, we assume that nature chooses $f$.

For a sample path $(x_0, x_1, \ldots, x_T)$, let $N_{ij}$ denote the number of one-step transitions from state $i$ to state $j$ along the path.

The likelihood of the path under model $m \in \{f, g\}$ is

$$
L_T^{(m)} = \pi_{0,x_0}^{(m)} \prod_{i=1}^n \prod_{j=1}^n \left(P_{ij}^{(m)}\right)^{N_{ij}}
$$

Hence,

$$
\log L_T^{(m)} = \log \pi_{0,x_0}^{(m)} + \sum_{i,j} N_{ij} \log P_{ij}^{(m)}
$$

The log-likelihood ratio is

$$
\log \frac{L_T^{(f)}}{L_T^{(g)}} = \log \frac{\pi_{0,x_0}^{(f)}}{\pi_{0,x_0}^{(g)}} + \sum_{i,j} N_{ij} \log \frac{P_{ij}^{(f)}}{P_{ij}^{(g)}}
$$ (eq:llr_markov)

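As a quick numerical illustration of {eq}`eq:llr_markov` (a sketch with made-up inputs, not the models studied below), the cell below simulates one path from a hypothetical 2-state chain, tallies the transition counts $N_{ij}$, and evaluates the log-likelihood ratio directly from the counts; the two matrices and the path length are arbitrary placeholders, and both models are given the same initial distribution so that the first term in {eq}`eq:llr_markov` vanishes.

```{code-cell} ipython3
# Hypothetical 2-state transition matrices (placeholders for illustration only)
P_a = np.array([[0.8, 0.2],
                [0.4, 0.6]])
P_b = np.array([[0.6, 0.4],
                [0.3, 0.7]])

# Simulate one path of arbitrary length from the chain governed by P_a
path = qe.MarkovChain(P_a).simulate(ts_length=200, init=0)

# Tally one-step transition counts N_ij
N = np.zeros((2, 2))
np.add.at(N, (path[:-1], path[1:]), 1)

# Log-likelihood ratio computed from the counts; the initial-distribution
# term is zero because both models share the same initial distribution
log_LR = np.sum(N * np.log(P_a / P_b))
print(f"log likelihood ratio after {len(path) - 1} transitions: {log_LR:.4f}")
```
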
### KL divergence rate

By the ergodic theorem for irreducible, aperiodic Markov chains, we have

$$
\frac{N_{ij}}{T} \xrightarrow{a.s.} \pi_i^{(f)} P_{ij}^{(f)} \quad \text{as } T \to \infty
$$

where $\boldsymbol{\pi}^{(f)}$ is the stationary distribution satisfying $\boldsymbol{\pi}^{(f)} = \boldsymbol{\pi}^{(f)} P^{(f)}$.

Therefore, dividing {eq}`eq:llr_markov` by $T$ gives

$$
\frac{1}{T}\log \frac{L_T^{(f)}}{L_T^{(g)}} = \frac{1}{T}\log \frac{\pi_{0,x_0}^{(f)}}{\pi_{0,x_0}^{(g)}} + \frac{1}{T}\sum_{i,j} N_{ij} \log \frac{P_{ij}^{(f)}}{P_{ij}^{(g)}}
$$

Taking the limit as $T \to \infty$:

- the first term satisfies $\frac{1}{T}\log \frac{\pi_{0,x_0}^{(f)}}{\pi_{0,x_0}^{(g)}} \to 0$;
- the second term satisfies $\frac{1}{T}\sum_{i,j} N_{ij} \log \frac{P_{ij}^{(f)}}{P_{ij}^{(g)}} \xrightarrow{a.s.} \sum_{i,j} \pi_i^{(f)} P_{ij}^{(f)} \log \frac{P_{ij}^{(f)}}{P_{ij}^{(g)}}$.

Define the **KL divergence rate** as

$$
h_{KL}(f, g) = \sum_{i=1}^n \pi_i^{(f)} \underbrace{\sum_{j=1}^n P_{ij}^{(f)} \log \frac{P_{ij}^{(f)}}{P_{ij}^{(g)}}}_{=: KL(P_{i\cdot}^{(f)}, P_{i\cdot}^{(g)})}
$$

where $KL(P_{i\cdot}^{(f)}, P_{i\cdot}^{(g)})$ is the KL divergence between the $i$-th rows of the two transition matrices.

Combining the two limits above, we have

$$
\frac{1}{T}\log \frac{L_T^{(f)}}{L_T^{(g)}} \xrightarrow{a.s.} h_{KL}(f, g) \quad \text{as } T \to \infty
$$

Taking expectations and using the dominated convergence theorem, we obtain

$$
\frac{1}{T} E_f\left[\log \frac{L_T^{(f)}}{L_T^{(g)}}\right] \to h_{KL}(f, g) \quad \text{as } T \to \infty
$$

Here we invite readers to pause and compare this result with {eq}`eq:kl_likelihood_link`.

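One way to see the connection: if every row of each transition matrix equals the same probability vector, so that $P_{ij}^{(f)} = f_j$ and $P_{ij}^{(g)} = g_j$ for all $i$, then the chain under $f$ generates i.i.d. draws from $f$, its stationary distribution is $f$ itself, and

$$
h_{KL}(f, g) = \sum_{i} f_i \sum_{j} f_j \log \frac{f_j}{g_j} = \sum_{j} f_j \log \frac{f_j}{g_j}
$$

so the KL divergence rate reduces to the ordinary KL divergence of the i.i.d. model.
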
Let's confirm the convergence result in the simulation below.

### Simulations

Let's implement simulations to illustrate these concepts with a pair of three-state Markov chains.

We start by writing functions that compute the stationary distribution and the KL divergence rate of Markov chain models, simulate sample paths, and compute likelihood ratio processes along those paths

```{code-cell} ipython3
def compute_stationary_dist(P):
    """
    Compute the stationary distribution of transition matrix P
    """
    # The stationary distribution is the left eigenvector of P associated
    # with the eigenvalue of largest modulus (the unit eigenvalue for an
    # irreducible, aperiodic chain), normalized to sum to one
    eigenvalues, eigenvectors = np.linalg.eig(P.T)
    idx = np.argmax(np.abs(eigenvalues))
    stationary = np.real(eigenvectors[:, idx])
    return stationary / stationary.sum()

def markov_kl_divergence(P_f, P_g, pi_f):
    """
    Compute the KL divergence rate between two Markov chains
    """
    # If f assigns positive probability to a transition that g rules out,
    # the KL divergence rate is infinite
    if np.any((P_f > 0) & (P_g == 0)):
        return np.inf

    valid_mask = (P_f > 0) & (P_g > 0)

    log_ratios = np.zeros_like(P_f)
    log_ratios[valid_mask] = np.log(P_f[valid_mask] / P_g[valid_mask])

    # Weight row-wise KL terms by stationary probabilities and sum
    kl_rate = np.sum(pi_f[:, np.newaxis] * P_f * log_ratios)

    return kl_rate

def simulate_markov_chain(P, pi_0, T, N_paths=1000):
    """
    Simulate N_paths sample paths of length T+1 from a Markov chain
    """
    mc = qe.MarkovChain(P)

    # Draw each path's initial state from the initial distribution pi_0
    initial_states = np.random.choice(len(P), size=N_paths, p=pi_0)

    paths = np.zeros((N_paths, T+1), dtype=int)

    for i in range(N_paths):
        path = mc.simulate(T+1, init=initial_states[i])
        paths[i, :] = path

    return paths


def compute_likelihood_ratio_markov(paths, P_f, P_g, π_0_f, π_0_g):
    """
    Compute the likelihood ratio process along Markov chain paths
    """
    N_paths, T_plus_1 = paths.shape
    T = T_plus_1 - 1
    L_ratios = np.ones((N_paths, T+1))

    # Initial-state contribution
    L_ratios[:, 0] = π_0_f[paths[:, 0]] / π_0_g[paths[:, 0]]

    # Multiply by the ratio of transition probabilities at each step
    for t in range(1, T+1):
        prev_states = paths[:, t-1]
        curr_states = paths[:, t]

        transition_ratios = P_f[prev_states, curr_states] \
                            / P_g[prev_states, curr_states]
        L_ratios[:, t] = L_ratios[:, t-1] * transition_ratios

    return L_ratios
```
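
As a quick sanity check of `compute_stationary_dist` (an extra check with a made-up example), the cell below compares its output with the closed-form stationary distribution of a two-state chain with transition matrix $\begin{pmatrix} 1-a & a \\ b & 1-b \end{pmatrix}$, which is $\left(\tfrac{b}{a+b}, \tfrac{a}{a+b}\right)$; the values of $a$ and $b$ are arbitrary.

```{code-cell} ipython3
# Two-state chain with a known stationary distribution (a, b are arbitrary)
a, b = 0.2, 0.5
P_check = np.array([[1 - a, a],
                    [b, 1 - b]])

print(compute_stationary_dist(P_check))   # numerical
print(np.array([b, a]) / (a + b))         # closed form (b/(a+b), a/(a+b))
```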

Now let's create an example with two different 3-state Markov chains

```{code-cell} ipython3
P_f = np.array([[0.7, 0.2, 0.1],
                [0.3, 0.5, 0.2],
                [0.1, 0.3, 0.6]])

P_g = np.array([[0.5, 0.3, 0.2],
                [0.2, 0.6, 0.2],
                [0.2, 0.2, 0.6]])

# Compute stationary distributions
π_f = compute_stationary_dist(P_f)
π_g = compute_stationary_dist(P_g)

print(f"Stationary distribution (f): {π_f}")
print(f"Stationary distribution (g): {π_g}")

# Compute KL divergence rates
kl_rate_fg = markov_kl_divergence(P_f, P_g, π_f)
kl_rate_gf = markov_kl_divergence(P_g, P_f, π_g)

print(f"\nKL divergence rate h(f, g): {kl_rate_fg:.4f}")
print(f"KL divergence rate h(g, f): {kl_rate_gf:.4f}")
```
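
Before turning to the likelihood ratio process itself, here is a quick empirical check (an addition to the example above) of the ergodic-frequency claim $\frac{N_{ij}}{T} \approx \pi_i^{(f)} P_{ij}^{(f)}$: we count transitions along a single long path generated by $P^{(f)}$ and compare the empirical frequencies with the theoretical values; the path length is an arbitrary choice.

```{code-cell} ipython3
# Empirical check of N_ij / T ≈ π_i^(f) P_ij^(f) on one long path
T_check = 100_000
path = simulate_markov_chain(P_f, π_f, T_check, N_paths=1)[0]

# Count one-step transitions
N = np.zeros((3, 3))
np.add.at(N, (path[:-1], path[1:]), 1)

print("empirical N_ij / T:")
print(N / T_check)
print("theoretical π_i^(f) P_ij^(f):")
print(π_f[:, None] * P_f)
```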

We are now ready to simulate paths and visualize how likelihood ratios evolve.

We'll verify that $\frac{1}{T}E_f\left[\log \frac{L_T^{(f)}}{L_T^{(g)}}\right]$ approaches $h_{KL}(f, g)$ when the chain starts from the stationary distribution by plotting both the empirical average and the line predicted by the theory

```{code-cell} ipython3
T = 500
N_paths = 1000
paths_from_f = simulate_markov_chain(P_f, π_f, T, N_paths)

L_ratios_f = compute_likelihood_ratio_markov(paths_from_f,
                                             P_f, P_g,
                                             π_f, π_g)

plt.figure(figsize=(10, 6))

# Plot individual paths
n_show = 50
for i in range(n_show):
    plt.plot(np.log(L_ratios_f[i, :]),
             alpha=0.3, color='blue', lw=0.8)

# Plot the theoretical expectation
theory_line = kl_rate_fg * np.arange(T+1)
plt.plot(theory_line, 'k--', linewidth=2.5,
         label=r'$T \times h_{KL}(f,g)$')

# Plot the empirical mean across paths
avg_log_L = np.mean(np.log(L_ratios_f), axis=0)
plt.plot(avg_log_L, 'r-', linewidth=2.5,
         label='empirical average', alpha=0.5)

plt.axhline(y=0, color='gray',
            linestyle='--', alpha=0.5)
plt.xlabel(r'$T$')
plt.ylabel(r'$\log L_T$')
plt.title('nature = $f$')
plt.legend()
plt.show()
```
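
As a numeric complement to the figure, we can compare the terminal slope of the empirical average with the theoretical rate $h_{KL}(f, g)$

```{code-cell} ipython3
# Compare the empirical growth rate of log L_T with the KL divergence rate
print(f"empirical  (1/T) E_f[log L_T]: {avg_log_L[-1] / T:.4f}")
print(f"theoretical h_KL(f, g):        {kl_rate_fg:.4f}")
```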

Let's examine how the model selection error probability depends on sample size, using the same simulation strategy as in the previous section

```{code-cell} ipython3
def compute_selection_error(T_values, P_f, P_g, π_0_f, π_0_g, N_sim=1000):
    """
    Compute the model selection error probability for different sample sizes
    """
    errors = []

    for T in T_values:
        # Simulate from both models
        paths_f = simulate_markov_chain(P_f, π_0_f, T, N_sim//2)
        paths_g = simulate_markov_chain(P_g, π_0_g, T, N_sim//2)

        # Compute likelihood ratios
        L_f = compute_likelihood_ratio_markov(paths_f,
                                              P_f, P_g,
                                              π_0_f, π_0_g)
        L_g = compute_likelihood_ratio_markov(paths_g,
                                              P_f, P_g,
                                              π_0_f, π_0_g)

        # Decision rule: choose f if L_T >= 1
        error_f = np.mean(L_f[:, -1] < 1)   # Type I error: choose g when nature drew f
        error_g = np.mean(L_g[:, -1] >= 1)  # Type II error: choose f when nature drew g

        # Unconditional error probability with equal prior weights on f and g
        total_error = 0.5 * (error_f + error_g)
        errors.append(total_error)

    return np.array(errors)

# Compute error probabilities
T_values = np.arange(10, 201, 10)
errors = compute_selection_error(T_values,
                                 P_f, P_g,
                                 π_f, π_g)

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(T_values, errors, linewidth=2)
plt.xlabel('$T$')
plt.ylabel('error probability')
plt.show()
```
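
The error probability should decline roughly exponentially in $T$; as a rough check (meaningful only while the Monte Carlo estimates remain strictly positive), we can estimate the decay rate by fitting a straight line to the log error probability against $T$

```{code-cell} ipython3
# Rough estimate of the exponential decay rate of the error probability;
# keep only sample sizes where the Monte Carlo estimate is still positive
positive = errors > 0
slope, intercept = np.polyfit(T_values[positive], np.log(errors[positive]), 1)
print(f"estimated decay rate of the error probability: {-slope:.4f} per period")
```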

## Measuring discrepancies between distributions

A plausible guess is that the ability of a likelihood ratio to distinguish distributions $f$ and $g$ depends on how "different" they are.