@@ -20,10 +20,9 @@ def __init__(self, mu_init: Tensor, log_sigma_init: Tensor) -> None:
         # check that the input shapes are the same
         assert len(mu_init) == len(log_sigma_init), "init shapes must be equal."
         assert mu_init.shape == log_sigma_init.shape, "init shapes must be equal."
-        self.d = len(mu_init)
         self.total_weights = len(mu_init)
         # saving parameters
-        self.sparse_index = torch.ones(self.d).bool()
+        self.register_buffer("sparse_index", torch.ones(len(mu_init)).bool())
 
         self.mu = mu_init
         self.log_sigma = log_sigma_init
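A note on the `register_buffer` change above: a plain tensor attribute is invisible to the module machinery, while a buffer is saved in `state_dict()` and moved by `.to(device)`. A minimal sketch of that behavior (not from the diff):

```python
import torch
from torch import nn

# Minimal sketch: a registered buffer, unlike a plain tensor attribute,
# is included in state_dict() and follows .to(device).
class WithBuffer(nn.Module):
    def __init__(self, d: int) -> None:
        super().__init__()
        self.register_buffer("sparse_index", torch.ones(d).bool())

m = WithBuffer(4)
print("sparse_index" in m.state_dict())  # True: the mask is checkpointed
```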
@@ -44,6 +43,10 @@ def log_sigma(self) -> Tensor:
     def log_sigma(self, value: Tensor):
         self.__log_sigma = Parameter(value)
 
+    @property
+    def d(self) -> int:
+        return int(self.sparse_index.sum())
+
     def var(self) -> Tensor:
         return torch.exp(self.log_sigma).pow(2)
 
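The new `d` property derives the active dimension from the mask instead of caching it, so it can never go stale after pruning. A small illustration of the invariant:

```python
import torch

# The invariant the new property enforces: d is the number of True entries
# in sparse_index, so pruning can never leave a stale cached dimension.
mask = torch.ones(10, dtype=torch.bool)
print(int(mask.sum()))  # 10: nothing pruned yet
mask[3:7] = False       # prune four weights
print(int(mask.sum()))  # 6: d follows the mask with no manual bookkeeping
```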
@@ -52,7 +55,6 @@ def update_sparse_index(self, sparse_index: Tensor) -> None:
             len(sparse_index) == self.total_weights
         ), "Sparse index should be a bool array masking unimportant weights."
         self.sparse_index = sparse_index
-        self.d = int(self.sparse_index.sum())
 
     def forward(self, n: int) -> Tensor:
         """
@@ -65,7 +67,7 @@ def forward(self, n: int) -> Tensor:
             Tensor: (n,d) reparameterized samples from variational distribution
         """
         sigma = torch.exp(self.log_sigma)
-        return self.mu + torch.randn(n, self.d) * sigma
+        return self.mu + torch.randn(n, len(sigma), device=sigma.device) * sigma
 
 
 class LogNormalMeanFieldVariational(NormalMeanFieldVariational):
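For context, `forward` is the standard Gaussian reparameterization trick: fixed `N(0, I)` noise is scaled and shifted so gradients reach `mu` and `log_sigma`. A minimal sketch:

```python
import torch

# Reparameterization trick as used in forward(): fixed N(0, I) noise is
# scaled and shifted, so the samples are differentiable in mu and log_sigma.
mu = torch.zeros(3, requires_grad=True)
log_sigma = torch.zeros(3, requires_grad=True)
eps = torch.randn(5, 3)                    # (n, d) standard normal noise
samples = mu + eps * torch.exp(log_sigma)  # (n, d) reparameterized samples
samples.sum().backward()
print(mu.grad.shape, log_sigma.grad.shape)  # gradients reach both parameters
```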
@@ -95,9 +97,9 @@ def forward(self, n: int) -> Tensor:
 
 class SVIHalfCauchyPrior(nn.Module):
     """
-    Class for performing sparse Bayesian learning using stochastic variational inference. This class provides 
-    utilities for generating reparameterized samples from the variational distribution and computing the 
-    KL-divergence between the variational distribution and the prior exactly 
+    Class for performing sparse Bayesian learning using stochastic variational inference. This class provides
+    utilities for generating reparameterized samples from the variational distribution and computing the
+    KL-divergence between the variational distribution and the prior exactly.
 
     Args:
         d (int): number of parameters
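Background on the prior named in this docstring, as a standalone sketch using `torch.distributions.HalfCauchy` (nothing here is from the diff itself): a half-Cauchy scale prior has median `tau` and very heavy tails, which is what lets most weight scales collapse toward zero while a few stay large.

```python
import torch
from torch.distributions import HalfCauchy

# Half-Cauchy scale prior in isolation: median equal to tau, heavy tails.
# Most draws are modest, but a non-negligible fraction is huge, which is
# the behavior that induces sparsity in the weight scales.
tau = 2.0
z = HalfCauchy(scale=tau).sample((100_000,))
print(z.median())                     # close to tau = 2.0
print((z > 10 * tau).float().mean())  # roughly 6% exceed 10 * tau
```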
@@ -116,26 +118,52 @@ class SVIHalfCauchyPrior(nn.Module):
 
     def __init__(self, d: int, tau: Union[Tensor, float], w_init: Tensor = None):
         super().__init__()
+        # fixing gamma parameterization mixup
+        tau = 1 / math.sqrt(tau)
         if isinstance(tau, float):
             tau = torch.tensor(tau)
         self.register_buffer("tau", tau)
+        tau_data = torch.tensor(
+            [1.0, 1 / 1e-1, 1 / 1e-2, 1 / 1e-3, 1 / 1e-4, 1 / 1e-5]
+        ).log()
+        tau_data = torch.stack([torch.ones(6), tau_data], dim=-1)
+        mu_data = torch.tensor(
+            [-1.6932, -6.2983, -10.9035, -15.5087, -20.1138, -24.7190]
+        )
+        # scale_data = (tau_data.log() @ mu_data) / (tau_data.log() @ tau_data.log())
+        scale = torch.linalg.solve(tau_data.t() @ tau_data, (tau_data.t() @ mu_data))
+        if w_init is not None:
+            assert len(w_init) == d, "w_init must be a vector of length d."
+            global_scale_init = -1.6931 * torch.ones(1)
+            noise = -5.0
+        else:
+            # initializing such that kl-divergence is minimized
+            global_scale_init = scale[0] + scale[1] * tau.log() * torch.ones(1)
+            noise = 0.3466
+        # global_scale_init = -1.6931 * torch.ones(1)
+        # data fit for tau
         self.s_a = LogNormalMeanFieldVariational(
-            torch.zeros(1), -6.0 + torch.randn(1) * 1e-4
+            global_scale_init, noise + torch.randn(1) * 1e-4
         )
         self.s_b = LogNormalMeanFieldVariational(
-            torch.zeros(1), -6.0 + torch.randn(1) * 1e-4
+            1.6931 * torch.ones(1), noise + torch.randn(1) * 1e-4
         )
         self.gamma_a = LogNormalMeanFieldVariational(
-            torch.zeros(d), -6.0 + torch.randn(d) * 1e-4
+            -1.6931 * torch.ones(d), noise + torch.randn(d) * 1e-4
         )
         self.gamma_b = LogNormalMeanFieldVariational(
-            torch.zeros(d), -6.0 + torch.randn(d) * 1e-4
+            +1.6931 * torch.ones(d), noise + torch.randn(d) * 1e-4
         )
         if w_init is None:
-            w_init = torch.randn(d)
-        self.w_tilde = NormalMeanFieldVariational(w_init, -6.0 + torch.randn(d) * 1e-4)
-        self.register_buffer("sparse_index", torch.arange(d))
-        self.pruning_tol = 0.0
+            w_init = torch.zeros(d)
+            w_tilde_noise = -0.0
+        else:
+            w_tilde_noise = -6.0
+        self.w_tilde = NormalMeanFieldVariational(
+            w_init, w_tilde_noise + torch.randn(d) * 1e-6
+        )
+        self.register_buffer("sparse_index", torch.ones(d, dtype=torch.bool))
+        self.register_buffer("pruning_tol", torch.tensor(0.0))
 
     def _log_normal_reparam(
         self,
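The `tau_data`/`mu_data` block above is a least-squares fit: it regresses tabulated log-scale initializations on `log(tau)` with an intercept via the normal equations, then evaluates the fitted line at the given `tau`. A standalone sketch of the same computation (numbers copied from the diff and taken as given):

```python
import torch

# Regress the tabulated optimal log-scales (y) on log(tau) with an
# intercept by solving the normal equations X^T X s = X^T y.
taus = torch.tensor([1.0, 1 / 1e-1, 1 / 1e-2, 1 / 1e-3, 1 / 1e-4, 1 / 1e-5])
X = torch.stack([torch.ones(6), taus.log()], dim=-1)  # design matrix [1, log tau]
y = torch.tensor([-1.6932, -6.2983, -10.9035, -15.5087, -20.1138, -24.7190])
s = torch.linalg.solve(X.t() @ X, X.t() @ y)          # [intercept, slope]
tau = torch.tensor(0.1)
print(s[0] + s[1] * tau.log())                        # fitted init for this tau
```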
@@ -158,15 +186,15 @@ def _log_normal_reparam(
             log_sigma_b (Tensor): log stdev of r.v. b
 
         Returns:
-            Tensor: (n,d) reparmeterized samples 
+            Tensor: (n,d) reparameterized samples
         """
         mu = 0.5 * (mu_a + mu_b)
         var = 0.25 * (torch.exp(log_sigma_a).pow(2) + torch.exp(log_sigma_b).pow(2))
-        return torch.exp(mu + torch.randn(n, d) * var.sqrt())
+        return torch.exp(mu + torch.randn(n, d, device=mu.device) * var.sqrt())
 
     def get_reparam_weights(self, n: int) -> Tensor:
         """
-        Generate reparameterized samples 
+        Generate reparameterized samples
 
         Args:
             n (int): number of reparam samples
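Why the `0.5`/`0.25` factors in `_log_normal_reparam` are exact: for independent log-normals `a` and `b`, `sqrt(a * b)` is again log-normal with averaged mean and quarter-summed variance, so the method reparameterizes the square root of the product directly. A numerical check:

```python
import torch

# For independent a ~ LogNormal(mu_a, s_a^2) and b ~ LogNormal(mu_b, s_b^2),
# sqrt(a * b) ~ LogNormal(0.5 * (mu_a + mu_b), 0.25 * (s_a^2 + s_b^2)).
mu_a, mu_b, s_a, s_b = 0.0, 1.0, 0.5, 0.3
n = 100_000
a = torch.exp(mu_a + s_a * torch.randn(n))
b = torch.exp(mu_b + s_b * torch.randn(n))
direct = (a * b).sqrt()
mu = 0.5 * (mu_a + mu_b)
var = 0.25 * (s_a**2 + s_b**2)
reparam = torch.exp(mu + var**0.5 * torch.randn(n))
print(direct.log().mean(), reparam.log().mean())  # both near 0.5
print(direct.log().var(), reparam.log().var())    # both near 0.085
```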
@@ -228,7 +256,7 @@ def kl_divergence(self) -> Tensor:
         Computes the KL divergence for the approximating posteriors
 
         Returns:
-            Tensor: kl divergence 
+            Tensor: kl divergence
         """
         kl_sa = self._kl_s_a()
         kl_sb = self._kl_s_b()
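The `_kl_*` helpers themselves are outside these hunks, but the "exactly" in the class docstring refers to closed-form KL terms for the mean-field factors. As a hedged illustration against a standard normal prior (not necessarily the prior used by this class):

```python
import torch
from torch.distributions import Normal, kl_divergence

# Closed-form KL for a Gaussian factor:
# KL(N(m, s^2) || N(0, 1)) = -log(s) + (s^2 + m^2) / 2 - 1/2,
# so no Monte Carlo estimate is needed for this part of the ELBO.
m, log_s = torch.tensor(0.3), torch.tensor(-1.0)
s = log_s.exp()
closed_form = -log_s + 0.5 * (s**2 + m**2) - 0.5
print(closed_form)
print(kl_divergence(Normal(m, s), Normal(0.0, 1.0)))  # matches exactly
```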
@@ -240,7 +268,7 @@ def kl_divergence(self) -> Tensor:
     def _compute_sparsity_tolerance(self, negative_log_mode: Tensor) -> Tensor:
         """
         Provides a reasonable pruning tolerance using the mid range of the
-        negative log modes 
+        negative log modes
 
         Args:
             negative_log_mode (Tensor): negative log mode of the weight est
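The body of `_compute_sparsity_tolerance` is not shown in these hunks; assuming "mid range" means halfway between the extreme scores, a sketch would be:

```python
import torch

# Assumed reading of "mid range": halfway between the smallest and largest
# negative log-modes. Scores at or below the tolerance are kept (see the
# mask update in update_sparse_index below).
negative_log_mode = torch.tensor([0.1, 0.4, 3.0, 8.0, 7.5])
tol = 0.5 * (negative_log_mode.min() + negative_log_mode.max())
print(tol)                       # 4.05 for this toy input
print(negative_log_mode <= tol)  # keeps the first three weights
```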
@@ -252,9 +280,9 @@ def _compute_sparsity_tolerance(self, negative_log_mode: Tensor) -> Tensor:
 
     def update_sparse_index(self) -> Tensor:
         """
-        Updates the sparse_index by pruning based on the negative log-mode 
+        Updates the sparse_index by pruning based on the negative log-mode
 
-        Returns: 
-            Tensor: negative log mode for each parameter
+        Returns:
+            Tensor: pruning margin (tolerance minus negative log-mode) per parameter
         """
         mu_zi = 0.5 * (self.s_a.mu + self.s_b.mu + self.gamma_a.mu + self.gamma_b.mu)
@@ -263,9 +291,19 @@ def update_sparse_index(self) -> Tensor:
         )
         negative_log_mode = var_zi - mu_zi
         self.pruning_tol = self._compute_sparsity_tolerance(negative_log_mode)
-        self.sparse_index = negative_log_mode <= self.pruning_tol
+        self.sparse_index = (negative_log_mode <= self.pruning_tol).cpu()
+        self._propogate_sparse_index(self.sparse_index)
+        return -negative_log_mode + self.pruning_tol
+
+    def reset_sparse_index(self) -> None:
+        """
+        Resets the sparse_index so that no weights are pruned
+
+        Returns:
+            None
+        """
+        self.sparse_index = torch.ones(len(self.sparse_index), dtype=torch.bool)
         self._propogate_sparse_index(self.sparse_index)
-        return negative_log_mode
 
     def _propogate_sparse_index(self, sparse_index) -> None:
         """
@@ -280,4 +318,3 @@ def _propogate_sparse_index(self, sparse_index) -> None:
     print(2.0)
     svi = SVIHalfCauchyPrior(10, torch.tensor(1.0))
     print(svi.get_reparam_weights(20).shape)
-
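Putting the new methods together, a hypothetical end-to-end loop (assuming the class from this diff is importable; the optimizer steps between these calls are omitted):

```python
import torch

# Hypothetical usage of the new API surface from this diff.
svi = SVIHalfCauchyPrior(10, torch.tensor(1.0))
w = svi.get_reparam_weights(20)     # (20, 10) weight samples for the ELBO
kl = svi.kl_divergence()            # closed-form KL term of the ELBO
margin = svi.update_sparse_index()  # prune; non-negative margin means kept
svi.reset_sparse_index()            # undo pruning, e.g. to resume training
print(w.shape, margin)
```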