High loss when using bayesian lstm instead of standard lstm

I am trying to build a model with the Bayesian LSTM layer. I already have an equivalent model based on a standard LSTM, and it gets good results on a classification task.
When I switch to the Bayesian layer, the loss becomes very high and the accuracy barely improves. I tried changing the model's hyperparameters (especially the prior parameters and posterior_rho), but it didn't help much. I also added sharpen=True for loss sharpening, but nothing changed.

The model:
```
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
##### Bayesian version #####
from layers.lstm_bayesian_layer import BayesianLSTM
from blitz.utils import variational_estimator
from layers.linear_bayesian_layer import BayesianLinear

from layers.attention import Attention, NoQueryAttention
from layers.squeeze_embedding import SqueezeEmbedding

@variational_estimator
class LSTM_BAYES_RNN(nn.Module):
    """Aspect-aware attention classifier built around a Bayesian LSTM.

    The forward pass appends the mean aspect embedding to every token
    embedding, encodes the result with ``BayesianLSTM``, scores each step with
    ``NoQueryAttention`` over the hidden state concatenated with the aspect
    vector, and feeds the attention-weighted hidden state to a deterministic
    ``nn.Linear`` output layer.
    """

    def __init__(self, embedding_matrix, opt):
        """Create the embedding, recurrent, attention and output layers.

        Args:
            embedding_matrix: Pretrained embedding weights; converted to a
                float tensor and loaded with ``nn.Embedding.from_pretrained``.
            opt: Options object; ``embed_dim``, ``hidden_dim`` and
                ``polarities_dim`` are read here, and the object is kept on
                ``self.opt``.
        """
        super().__init__()

        # The LSTM input is twice the embedding width because forward()
        # concatenates the aspect vector with each token embedding.
        # NOTE(review): posterior_rho_init=1 and prior sigmas of 5 are large;
        # if this layer maps rho to a standard deviation via softplus (as
        # upstream blitz presumably does, with a default rho of -6.0 -- TODO
        # confirm), the sampled weights carry noise above 1, which could by
        # itself explain a high, non-converging loss.
        lstm_input_dim = opt.embed_dim * 2
        self.lstm = BayesianLSTM(
            lstm_input_dim,
            opt.hidden_dim,
            bias=True,
            freeze=False,
            prior_sigma_1=5,
            prior_sigma_2=5,
            posterior_rho_init=1,
            sharpen=True,
        )
        self.opt = opt

        pretrained_weights = torch.tensor(embedding_matrix, dtype=torch.float)
        self.embed = nn.Embedding.from_pretrained(pretrained_weights)
        self.squeeze_embedding = SqueezeEmbedding()

        # Attention scores are computed over [hidden state ; aspect vector].
        attention_dim = opt.hidden_dim + opt.embed_dim
        self.attention = NoQueryAttention(attention_dim, score_function='bi_linear')

        # The output layer is a plain (non-Bayesian) linear projection.
        self.dense = nn.Linear(opt.hidden_dim, opt.polarities_dim)

    def forward(self, inputs):
        """Compute class scores for one batch.

        Args:
            inputs: Indexable container; item 0 holds the text token indices
                and item 1 the aspect token indices. Entries equal to 0 are
                left out of the length counts (presumably padding -- confirm).

        Returns:
            The output of ``self.dense`` applied to the attention-weighted
            LSTM hidden states.
        """
        text_indices = inputs[0]
        aspect_indices = inputs[1]

        # Lengths are the number of non-zero indices along the last axis.
        text_lengths = (text_indices != 0).sum(dim=-1)
        longest_text = text_lengths.max()
        aspect_lengths = (aspect_indices != 0).sum(dim=-1).float()

        token_vectors = self.squeeze_embedding(self.embed(text_indices), text_lengths)

        # Mean aspect embedding: sum over axis 1 divided by the non-zero count.
        # NOTE(review): a row whose aspect indices are all 0 divides by zero.
        aspect_total = self.embed(aspect_indices).sum(dim=1)
        aspect_mean = aspect_total / aspect_lengths.unsqueeze(1)
        aspect_tiled = aspect_mean.unsqueeze(1).expand(-1, longest_text, -1)

        lstm_input = torch.cat((aspect_tiled, token_vectors), dim=-1)
        h, (_, _) = self.lstm(lstm_input)

        attention_input = torch.cat((h, aspect_tiled), dim=-1)
        _, score = self.attention(attention_input)

        # Weight the hidden states by the attention scores, then drop axis 1.
        pooled = torch.bmm(score, h).squeeze(dim=1)
        return self.dense(pooled)
```
In the training loop I have:
```
                # Bayesian loss calculation: weight the complexity (KL) term for
                # this minibatch, then let sample_elbo build the loss.
                # NOTE(review): num_batches is given self.opt.batch_size, i.e. the
                # number of samples per batch. Confirm whether minibatch_weight
                # expects the number of batches per epoch (e.g. the length of the
                # data loader) instead; a wrong value would mis-scale the
                # complexity cost.
                pi_weight = minibatch_weight(batch_idx=i_batch, num_batches=self.opt.batch_size)

                # sample_elbo presumably averages the criterion over sample_nbr
                # stochastic forward passes and adds the weighted complexity
                # cost -- confirm against the blitz documentation.
                loss = self.model.sample_elbo(
                        inputs=inputs,
                        labels=targets,
                        criterion=nn.CrossEntropyLoss(),
                        sample_nbr=10,
                        # complexity_cost_weight=1/len(self.trainset))
                        complexity_cost_weight = pi_weight)
      
                ##################

                # NOTE(review): no optimizer.zero_grad() appears in this excerpt;
                # confirm gradients are cleared before backward() elsewhere in
                # the loop.
                loss.backward()
                optimizer.step()

                # Take 3 outputs per example and average them along the stacking
                # axis to get the prediction scores.
                # NOTE(review): these passes run after optimizer.step() and, in
                # this excerpt, outside torch.no_grad() -- confirm that is
                # intended.
                outputs = torch.stack([self.model(inputs) for i in range(3)])
                preds = torch.mean(outputs, axis=0)
```

What's the problem here?


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

High loss when using bayesian lstm instead of standard lstm #100

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Uh oh!

High loss when using bayesian lstm instead of standard lstm #100

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions