Commit 59bc8d4
add image augmentation feature for low data settings
1 parent a88681e commit 59bc8d4

File tree

- README.md
- bin/stylegan2_pytorch
- setup.py
- stylegan2_pytorch/stylegan2_pytorch.py

4 files changed (+78 -10 lines changed)


README.md

Lines changed: 27 additions & 1 deletion
````diff
@@ -79,7 +79,22 @@ If a previous checkpoint contained a better generator, (which often happens as g
 $ stylegan2_pytorch --generate --load-from {checkpoint number}
 ```
 
-### Attention
+## Low amounts of Training Data
+
+In the past, GANs needed a lot of data to learn how to generate well. The faces model, for example, was trained on **70k** high-quality images from Flickr.
+
+However, in May 2020, researchers around the world independently converged on a simple technique to reduce that number to as low as **1-2k**: differentiably augment all images, real or generated, on their way into the discriminator during training.
+
+If the augmentation probability is kept low enough, the augmentations will not 'leak' into the generations.
+
+In the low-data setting, you can enable the feature with a simple flag.
+
+```bash
+# find a suitable probability between 0. and 0.7 at maximum
+$ stylegan2_pytorch --data ./data --aug-prob 0.25
+```
+
+## Attention
 
 This framework also allows for you to add an efficient form of self-attention to the designated layers of the discriminator (and the symmetric layer of the generator), which will greatly improve results. The more attention you can afford, the better!
 
````
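For intuition, here is a minimal sketch of the idea behind the new flag, separate from this commit's actual implementation (the `AugWrapper` in the `stylegan2_pytorch/stylegan2_pytorch.py` diff below): gate a differentiable augmentation behind a probability `p` and apply it to every batch, real or generated, on its way into the discriminator. The names `center_zoom` and `discriminator_input` are illustrative only, not part of the library.

```python
from random import random

import torch.nn.functional as F

def center_zoom(images):
    # one example of a differentiable augmentation: keep the central
    # three-quarters of each image, then resize back to full resolution
    _, _, h, w = images.shape
    crop = images[:, :, h // 8 : h - h // 8, w // 8 : w - w // 8]
    return F.interpolate(crop, size = (h, w))

def discriminator_input(images, p):
    # real and generated batches pass through the same gate, so the
    # discriminator cannot tell them apart by augmentation statistics alone
    return center_zoom(images) if random() < p else images
```

Because the augmentation is differentiable, generator gradients flow through it unchanged; keeping `p` well below 1 is what prevents the augmentations from leaking into the generated distribution.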
````diff
@@ -277,4 +292,15 @@ Thank you to Matthew Mann for his inspiring [simple port](https://github.com/man
     eprint = {2006.02595},
     archivePrefix = {arXiv}
 }
+```
+
+```bibtex
+@misc{karras2020training,
+    title = {Training Generative Adversarial Networks with Limited Data},
+    author = {Tero Karras and Miika Aittala and Janne Hellsten and Samuli Laine and Jaakko Lehtinen and Timo Aila},
+    year = {2020},
+    eprint = {2006.06676},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.CV}
+}
 ```
````

bin/stylegan2_pytorch

Lines changed: 4 additions & 2 deletions
```diff
@@ -30,7 +30,8 @@ def train_from_folder(
     fq_layers = [],
     fq_dict_size = 256,
     attn_layers = [],
-    no_const = False
+    no_const = False,
+    aug_prob = 0.
 ):
     model = Trainer(
         name,
@@ -50,7 +51,8 @@ def train_from_folder(
         fq_layers = fq_layers,
         fq_dict_size = fq_dict_size,
         attn_layers = attn_layers,
-        no_const = no_const
+        no_const = no_const,
+        aug_prob = aug_prob
     )
 
     if not new:
```
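Assuming the script exposes `train_from_folder` through an automatic CLI generator, so that hyphenated flags map onto keyword arguments (as the README hunk above suggests), the new keyword is usable both ways; a sketch:

```python
# from the command line:
#   $ stylegan2_pytorch --data ./data --aug-prob 0.25
# or calling the entry point directly:
train_from_folder(data = './data', aug_prob = 0.25)
```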

setup.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@
   name = 'stylegan2_pytorch',
   packages = find_packages(),
   scripts=['bin/stylegan2_pytorch'],
-  version = '0.15.0',
+  version = '0.16.0',
   license='GPLv3+',
   description = 'StyleGan2 in Pytorch',
   author = 'Phil Wang',
```

stylegan2_pytorch/stylegan2_pytorch.py

Lines changed: 46 additions & 6 deletions
```diff
@@ -213,7 +213,6 @@ def __init__(self, folder, image_size, transparent = False):
         self.transform = transforms.Compose([
             transforms.Lambda(convert_image_fn),
             transforms.Lambda(partial(resize_to_minimum_size, image_size)),
-            transforms.RandomHorizontalFlip(),
             transforms.Resize(image_size),
             transforms.CenterCrop(image_size),
             transforms.ToTensor(),
@@ -228,6 +227,41 @@ def __getitem__(self, index):
         img = Image.open(path)
         return self.transform(img)
 
+# augmentations
+
+def random_float(lo, hi):
+    return lo + (hi - lo) * random()
+
+def random_crop_and_resize(tensor, scale):
+    b, c, h, _ = tensor.shape
+    new_width = int(h * scale)
+    delta = h - new_width
+    h_delta = int(random() * delta)
+    w_delta = int(random() * delta)
+    cropped = tensor[:, :, h_delta:(h_delta + new_width), w_delta:(w_delta + new_width)].clone()
+    return F.interpolate(cropped, size=(h, h))
+
+def random_hflip(tensor, prob):
+    if prob > random():
+        return tensor
+    return torch.flip(tensor, dims=(3,))
+
+class AugWrapper(nn.Module):
+    def __init__(self, D, image_size):
+        super().__init__()
+        self.D = D
+
+    def forward(self, images, prob = 0., detach = False):
+        if random() < prob:
+            random_scale = random_float(0.5, 0.9)
+            images = random_hflip(images, prob=0.5)
+            images = random_crop_and_resize(images, scale = random_scale)
+
+        if detach:
+            images.detach_()
+
+        return self.D(images)
+
 # stylegan2 classes
 
 class StyleVectorizer(nn.Module):
```
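A quick sanity check of the helpers above, as a sketch that assumes they are imported from `stylegan2_pytorch/stylegan2_pytorch.py`. One subtlety worth noting: in `random_hflip`, `prob` is the probability of *keeping* the original orientation, which is symmetric at the `0.5` that `AugWrapper` passes.

```python
import torch
import torch.nn as nn

images = torch.randn(4, 3, 64, 64)  # dummy batch of four 64x64 RGB images

# scale 0.7 selects a random 44x44 window, then resizes it back to 64x64
zoomed = random_crop_and_resize(images, scale = 0.7)
assert zoomed.shape == images.shape

# flips the entire batch along the width axis, or leaves it untouched
flipped = random_hflip(images, prob = 0.5)

# nn.Identity() stands in for the discriminator in this sketch
D_aug = AugWrapper(nn.Identity(), image_size = 64)
out = D_aug(images, prob = 0.25)  # roughly a quarter of calls see an augmented batch
assert out.shape == images.shape
```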
```diff
@@ -495,7 +529,10 @@ def __init__(self, image_size, latent_dim = 512, style_depth = 8, network_capaci
         self.GE = Generator(image_size, latent_dim, network_capacity, transparent = transparent, attn_layers = attn_layers, no_const = no_const)
 
         # experimental contrastive loss discriminator regularization
-        self.D_cl = ContrastiveLearner(self.D, image_size, hidden_layer='flatten') if cl_reg else None
+        self.D_cl = None
+
+        # wrapper for augmenting all images going into the discriminator
+        self.D_aug = AugWrapper(self.D, image_size)
 
         set_requires_grad(self.SE, False)
         set_requires_grad(self.GE, False)
@@ -540,7 +577,7 @@ def forward(self, x):
         return x
 
 class Trainer():
-    def __init__(self, name, results_dir, models_dir, image_size, network_capacity, transparent = False, batch_size = 4, mixed_prob = 0.9, gradient_accumulate_every=1, lr = 2e-4, num_workers = None, save_every = 1000, trunc_psi = 0.6, fp16 = False, cl_reg = False, fq_layers = [], fq_dict_size = 256, attn_layers = [], no_const = False, *args, **kwargs):
+    def __init__(self, name, results_dir, models_dir, image_size, network_capacity, transparent = False, batch_size = 4, mixed_prob = 0.9, gradient_accumulate_every=1, lr = 2e-4, num_workers = None, save_every = 1000, trunc_psi = 0.6, fp16 = False, cl_reg = False, fq_layers = [], fq_dict_size = 256, attn_layers = [], no_const = False, aug_prob = 0., *args, **kwargs):
         self.GAN_params = [args, kwargs]
         self.GAN = None
 
@@ -558,6 +595,7 @@ def __init__(self, name, results_dir, models_dir, image_size, network_capacity,
 
         self.attn_layers = cast_list(attn_layers)
         self.no_const = no_const
+        self.aug_prob = aug_prob
 
         self.lr = lr
         self.batch_size = batch_size
@@ -632,6 +670,8 @@ def train(self):
         latent_dim = self.GAN.G.latent_dim
         num_layers = self.GAN.G.num_layers
 
+        aug_prob = self.aug_prob
+
         apply_gradient_penalty = self.steps % 4 == 0
         apply_path_penalty = self.steps % 32 == 0
         apply_cl_reg_to_generated = self.steps > 20000
@@ -677,11 +717,11 @@
             w_styles = styles_def_to_tensor(w_space)
 
             generated_images = self.GAN.G(w_styles, noise)
-            fake_output, fake_q_loss = self.GAN.D(generated_images.clone().detach())
+            fake_output, fake_q_loss = self.GAN.D_aug(generated_images.clone().detach(), detach = True, prob = aug_prob)
 
             image_batch = next(self.loader).cuda()
             image_batch.requires_grad_()
-            real_output, real_q_loss = self.GAN.D(image_batch)
+            real_output, real_q_loss = self.GAN.D_aug(image_batch, prob = aug_prob)
 
             divergence = (F.relu(1 + real_output) + F.relu(1 - fake_output)).mean()
             disc_loss = divergence
```
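An aside on the objective these outputs feed: the `divergence` above is a hinge loss, written with signs flipped relative to the more common convention (the discriminator drives real scores below -1 and fake scores above +1), and the generator loss in the following hunk simply minimizes the fake score. The augmentation changes only what the discriminator sees, not the loss itself. A sketch of both terms:

```python
import torch.nn.functional as F

def disc_hinge_loss(real_output, fake_output):
    # same expression as the divergence above:
    # real scores pushed <= -1, fake scores pushed >= +1
    return (F.relu(1 + real_output) + F.relu(1 - fake_output)).mean()

def gen_hinge_loss(fake_output):
    # the generator minimizes the fake score (see the following hunk)
    return fake_output.mean()
```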
```diff
@@ -716,7 +756,7 @@
             w_styles = styles_def_to_tensor(w_space)
 
             generated_images = self.GAN.G(w_styles, noise)
-            fake_output, _ = self.GAN.D(generated_images)
+            fake_output, _ = self.GAN.D_aug(generated_images, prob = aug_prob)
             loss = fake_output.mean()
             gen_loss = loss
 
```