Modified implementation to support the CIFAR10 dataset

cedrickchee · cedrickchee · commit 311a4ba062bd · 2018-01-30T22:52:30.000+08:00
diff --git a/decoder.py b/decoder.py
@@ -27,20 +27,22 @@ class Decoder(nn.Module):
     This Decoder network is used in training and prediction (testing).
     """
 
-    def __init__(self, num_classes, output_unit_size, cuda_enabled):
+    def __init__(self, num_classes, output_unit_size, input_width,
+                 input_height, num_conv_in_channel, cuda_enabled):
         """
         The decoder network consists of 3 fully connected layers, with
-        512, 1024, 784 neurons each.
+        512, 1024, 784 (or 3072 for CIFAR10) neurons each.
         """
         super(Decoder, self).__init__()
 
         self.cuda_enabled = cuda_enabled
 
         fc1_output_size = 512
         fc2_output_size = 1024
+        self.fc3_output_size = input_width * input_height * num_conv_in_channel
         self.fc1 = nn.Linear(num_classes * output_unit_size, fc1_output_size) # input dim 10 * 16.
         self.fc2 = nn.Linear(fc1_output_size, fc2_output_size)
-        self.fc3 = nn.Linear(fc2_output_size, 784)
+        self.fc3 = nn.Linear(fc2_output_size, self.fc3_output_size)
         # Activation functions
         self.relu = nn.ReLU(inplace=True)
         self.sigmoid = nn.Sigmoid()
@@ -49,14 +51,14 @@ def forward(self, x, target):
         """
         We send the outputs of the `DigitCaps` layer, which is a
         [batch_size, 10, 16] size tensor into the Decoder network, and
-        reconstruct a [batch_size, 784] size tensor representing the image.
+        reconstruct a [batch_size, fc3_output_size] size tensor representing the image.
 
         Args:
             x: [batch_size, 10, 16] The output of the digit capsule.
             target: [batch_size, 10] One-hot MNIST dataset labels.
 
         Returns:
-            reconstruction: [batch_size, 784] Tensor of reconstructed images.
+            reconstruction: [batch_size, fc3_output_size] Tensor of reconstructed images.
         """
         batch_size = target.size(0)
 
@@ -77,8 +79,8 @@ def forward(self, x, target):
         # Forward pass of the network
         fc1_out = self.relu(self.fc1(vector_j))
         fc2_out = self.relu(self.fc2(fc1_out)) # shape: [batch_size, 1024]
-        reconstruction = self.sigmoid(self.fc3(fc2_out)) # shape: [batch_size, 784]
+        reconstruction = self.sigmoid(self.fc3(fc2_out)) # shape: [batch_size, fc3_output_size]
 
-        assert reconstruction.size() == torch.Size([batch_size, 784])
+        assert reconstruction.size() == torch.Size([batch_size, self.fc3_output_size])
 
         return reconstruction
diff --git a/main.py b/main.py
@@ -177,11 +177,13 @@ def test(model, data_loader, num_train_batches, epoch, writer):
     # Get the reconstructed images of the last batch
     if args.use_reconstruction_loss:
         reconstruction = model.decoder(output, target)
-        image_width = 28 # MNIST digit image width
-        image_height = 28 # MNIST digit image height
-        image_channel = 1 # MNIST digit image channel
+        # Input image size and number of channels.
+        # By default (MNIST), the image width and height are 28x28, with 1 channel (grayscale).
+        image_width = args.input_width
+        image_height = args.input_height
+        image_channel = args.num_conv_in_channel
         recon_img = reconstruction.view(-1, image_channel, image_width, image_height)
-        assert recon_img.size() == torch.Size([batch_size, 1, 28, 28])
+        assert recon_img.size() == torch.Size([batch_size, image_channel, image_width, image_height])
 
         # Save the image into file system
         utils.save_image(recon_img, 'results/recons_image_test_{}_{}.png'.format(epoch, global_step))
@@ -264,6 +266,11 @@ def main():
                         help='use an additional reconstruction loss. default=True')
     parser.add_argument('--regularization-scale', type=float, default=0.0005,
                         help='regularization coefficient for reconstruction loss. default=0.0005')
+    parser.add_argument('--dataset', help='the name of dataset (mnist, cifar10)', default='mnist')
+    parser.add_argument('--input-width', type=int,
+                        default=28, help='input image width to the convolution. default=28 for MNIST')
+    parser.add_argument('--input-height', type=int,
+                        default=28, help='input image height to the convolution. default=28 for MNIST')
 
     args = parser.parse_args()
 
@@ -278,7 +285,7 @@ def main():
         torch.cuda.manual_seed(args.seed)
 
     # Load data
-    train_loader, test_loader = utils.load_mnist(args)
+    train_loader, test_loader = utils.load_data(args)
 
     # Build Capsule Network
     print('===> Building model')
@@ -291,6 +298,8 @@ def main():
                 num_routing=args.num_routing,
                 use_reconstruction_loss=args.use_reconstruction_loss,
                 regularization_scale=args.regularization_scale,
+                input_width=args.input_width,
+                input_height=args.input_height,
                 cuda_enabled=args.cuda)
 
     if args.cuda:
@@ -307,12 +316,14 @@ def main():
     for name, param in model.named_parameters():
         print('{}: {}'.format(name, list(param.size())))
 
-    # CapsNet has 8.2M parameters and 6.8M parameters without the reconstruction subnet.
+    # CapsNet has:
+    # - 8.2M parameters and 6.8M parameters without the reconstruction subnet on MNIST.
+    # - 11.8M parameters and 8.0M parameters without the reconstruction subnet on CIFAR10.
     num_params = sum([param.nelement() for param in model.parameters()])
 
     # The coupling coefficients c_ij are not included in the parameter list,
-    # we need to add them manually, which is 1152 * 10 = 11520.
-    print('\nTotal number of parameters: {}\n'.format(num_params + 11520))
+    # we need to add them manually: 1152 * 10 = 11520 (on MNIST) or 2048 * 10 = 20480 (on CIFAR10).
+    print('\nTotal number of parameters: {}\n'.format(num_params + (11520 if args.dataset == 'mnist' else 20480)))
 
     # Optimizer
     optimizer = optim.Adam(model.parameters(), lr=args.lr)
diff --git a/model.py b/model.py
@@ -21,9 +21,10 @@ class Net(nn.Module):
     A simple CapsNet with 3 layers
     """
 
-    def __init__(self, num_conv_in_channel, num_conv_out_channel, num_primary_unit, primary_unit_size,
-                 num_classes, output_unit_size, num_routing,
-                 use_reconstruction_loss, regularization_scale, cuda_enabled):
+    def __init__(self, num_conv_in_channel, num_conv_out_channel, num_primary_unit,
+                 primary_unit_size, num_classes, output_unit_size, num_routing,
+                 use_reconstruction_loss, regularization_scale, input_width, input_height,
+                 cuda_enabled):
         """
         In the constructor we instantiate one ConvLayer module and two CapsuleLayer modules
         and assign them as member variables.
@@ -34,9 +35,12 @@ def __init__(self, num_conv_in_channel, num_conv_out_channel, num_primary_unit,
 
         # Configurations used for image reconstruction.
         self.use_reconstruction_loss = use_reconstruction_loss
-        self.image_width = 28 # MNIST digit image width
-        self.image_height = 28 # MNIST digit image height
-        self.image_channel = 1 # MNIST digit image channel
+        # Input image size and number of channels.
+        # By default (MNIST), the image width and height are 28x28,
+        # with 1 channel (grayscale).
+        self.image_width = input_width
+        self.image_height = input_height
+        self.image_channel = num_conv_in_channel
 
         # Also known as lambda reconstruction. Default value is 0.0005.
         # We use sum of squared errors (SSE) similar to paper.
@@ -69,7 +73,8 @@ def __init__(self, num_conv_in_channel, num_conv_out_channel, num_primary_unit,
 
         # Reconstruction network
         if use_reconstruction_loss:
-            self.decoder = Decoder(num_classes, output_unit_size, cuda_enabled)
+            self.decoder = Decoder(num_classes, output_unit_size, input_width,
+                                   input_height, num_conv_in_channel, cuda_enabled)
 
     def forward(self, x):
         """