-
-
Notifications
You must be signed in to change notification settings - Fork 112
Open
Description
I am trying to implement a model using the bayesian lstm layer given I already have a model that relies on lstm and it gets good results for a classification task.
When I use the bayesian layer the loss becomes very high and the accuracy doesn't converge much. I tried changing the model's hyperparameters (especially prior variables and posterior_rho) but didn't that much. I also added sharpen=True for loss sharpening but nothing changed.
The model:
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
##### Bayesian version #####
from layers.lstm_bayesian_layer import BayesianLSTM
from blitz.utils import variational_estimator
from layers.linear_bayesian_layer import BayesianLinear
from layers.attention import Attention, NoQueryAttention
from layers.squeeze_embedding import SqueezeEmbedding
@variational_estimator
class LSTM_BAYES_RNN(nn.Module):
def __init__(self, embedding_matrix, opt):
super(LSTM_BAYES_RNN, self).__init__()
self.lstm = BayesianLSTM(opt.embed_dim*2, opt.hidden_dim, bias=True, freeze = False,
prior_sigma_1 = 5,
prior_sigma_2 = 5,
posterior_rho_init=1,
sharpen=True)
# prior_pi = 1,
# posterior_mu_init = 0,
# posterior_rho_init = -6.0,
self.opt = opt
self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))
self.squeeze_embedding = SqueezeEmbedding()
# self.dense = BayesianLinear(opt.hidden_dim, opt.polarities_dim, bias=True, freeze = False,
# prior_sigma_1 = 10, prior_sigma_2 = 10, posterior_rho_init = 5 )
self.attention = NoQueryAttention(opt.hidden_dim+opt.embed_dim, score_function='bi_linear')
self.dense = nn.Linear(opt.hidden_dim, opt.polarities_dim)
def forward(self, inputs):
text_indices, aspect_indices = inputs[0], inputs[1]
x_len = torch.sum(text_indices != 0, dim=-1)
x_len_max = torch.max(x_len)
aspect_len = torch.sum(aspect_indices != 0, dim=-1).float()
x = self.embed(text_indices)
x = self.squeeze_embedding(x, x_len)
aspect = self.embed(aspect_indices)
aspect_pool = torch.div(torch.sum(aspect, dim=1), aspect_len.unsqueeze(1))
aspect = aspect_pool.unsqueeze(1).expand(-1, x_len_max, -1)
x = torch.cat((aspect, x), dim=-1)
h, (_, _) = self.lstm(x)
ha = torch.cat((h, aspect), dim=-1)
_, score = self.attention(ha)
output = torch.squeeze(torch.bmm(score, h), dim=1)
out = self.dense(output)
return out
in the training I have
# bayesian loss calculation
pi_weight = minibatch_weight(batch_idx=i_batch, num_batches=self.opt.batch_size)
loss = self.model.sample_elbo(
inputs=inputs,
labels=targets,
criterion=nn.CrossEntropyLoss(),
sample_nbr=10,
# complexity_cost_weight=1/len(self.trainset))
complexity_cost_weight = pi_weight)
##################
loss.backward()
optimizer.step()
# take 3 outputs per example
outputs = torch.stack([self.model(inputs) for i in range(3)])
preds = torch.mean(outputs, axis=0)
What's the problem here?
Metadata
Metadata
Assignees
Labels
No labels