
Commit a59e8e5

Merge pull request #140 from lucidrains/pw/add-generator-loss-top-k
add Top-k Generator Training
2 parents 94f7349 + 1301330 commit a59e8e5

File tree

3 files changed: +42, -2 lines changed

README.md

Lines changed: 20 additions & 0 deletions
@@ -260,6 +260,15 @@ By default, the StyleGAN architecture styles a constant learned 4x4 block as it
$ stylegan2_pytorch --data ./data --no-const
```

+## Top-k Training for Generator
+
+A new paper has produced evidence that simply zeroing out the gradient contributions from the samples the discriminator most confidently deems fake lets the generator learn significantly better, achieving a new state of the art.
+
+```
+$ stylegan2_pytorch --data ./data --generator-top-k --generator-top-k-frac 0.5 --generator-top-k-gamma 0.99
+```
+
+Gamma is a decay rate that slowly decreases k from the full batch size down to the target fraction (50% by default, also a modifiable hyperparameter).

## Appreciation

@@ -373,3 +382,14 @@ Thank you to Matthew Mann for his inspiring [simple port](https://github.com/man
    primaryClass = {cs.LG}
}
```
+
+```bibtex
+@misc{sinha2020topk,
+    title   = {Top-k Training of GANs: Improving GAN Performance by Throwing Away Bad Samples},
+    author  = {Samarth Sinha and Zhengli Zhao and Anirudh Goyal and Colin Raffel and Augustus Odena},
+    year    = {2020},
+    eprint  = {2002.06224},
+    archivePrefix = {arXiv},
+    primaryClass = {stat.ML}
+}
+```
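
To make the decay schedule described in the README hunk above concrete, here is a small standalone sketch. It is not part of the commit: the batch size and epoch counts are invented for illustration, and only the defaults gamma = 0.99 and frac = 0.5 come from the diff.

```python
import math

# Hypothetical values chosen only to illustrate the schedule.
batch_size = 32
generator_top_k_gamma = 0.99
generator_top_k_frac = 0.5

for epochs in [0, 10, 25, 50, 100]:
    # the kept fraction decays geometrically with training epochs,
    # but never falls below the configured floor
    k_frac = max(generator_top_k_gamma ** epochs, generator_top_k_frac)
    k = math.ceil(batch_size * k_frac)
    print(f'after {epochs:3d} epochs: keep top {k} of {batch_size} generated samples')
```

With these defaults, k reaches the 50% floor after roughly 69 epochs, since 0.99^69 ≈ 0.5.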

stylegan2_pytorch/cli.py

Lines changed: 6 additions & 0 deletions
@@ -96,6 +96,9 @@ def train_from_folder(
    no_const = False,
    aug_prob = 0.,
    aug_types = ['translation', 'cutout'],
+    generator_top_k = False,
+    generator_top_k_gamma = 0.99,
+    generator_top_k_frac = 0.5,
    dataset_aug_prob = 0.,
    multi_gpus = False,
    calculate_fid_every = None
@@ -124,6 +127,9 @@ def train_from_folder(
        no_const = no_const,
        aug_prob = aug_prob,
        aug_types = cast_list(aug_types),
+        generator_top_k = generator_top_k,
+        generator_top_k_gamma = generator_top_k_gamma,
+        generator_top_k_frac = generator_top_k_frac,
        dataset_aug_prob = dataset_aug_prob,
        calculate_fid_every = calculate_fid_every
    )
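
These flags only plumb the new options from the command line through to the Trainer. As a hedged illustration, the same options could presumably also be set programmatically; the sketch below is an assumption rather than part of the commit, and it guesses that `train_from_folder` accepts a `data` path argument (not shown in this hunk) and is importable from `stylegan2_pytorch.cli`.

```python
# Hypothetical programmatic use of the new options; everything outside the three
# generator_top_k_* keywords is an assumption about the surrounding function.
from stylegan2_pytorch.cli import train_from_folder

train_from_folder(
    data = './data',                 # assumed: folder of training images
    generator_top_k = True,          # enable top-k training of the generator
    generator_top_k_gamma = 0.99,    # per-epoch decay of the kept fraction
    generator_top_k_frac = 0.5       # floor on the kept fraction of the batch
)
```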

stylegan2_pytorch/stylegan2_pytorch.py

Lines changed: 16 additions & 2 deletions
@@ -685,7 +685,7 @@ def forward(self, x):
        return x

class Trainer():
-    def __init__(self, name, results_dir, models_dir, image_size, network_capacity, transparent = False, batch_size = 4, mixed_prob = 0.9, gradient_accumulate_every=1, lr = 2e-4, lr_mlp = 1., ttur_mult = 2, rel_disc_loss = False, num_workers = None, save_every = 1000, trunc_psi = 0.6, fp16 = False, cl_reg = False, fq_layers = [], fq_dict_size = 256, attn_layers = [], no_const = False, aug_prob = 0., aug_types = ['translation', 'cutout'], dataset_aug_prob = 0., calculate_fid_every = None, is_ddp = False, rank = 0, world_size = 1, *args, **kwargs):
+    def __init__(self, name, results_dir, models_dir, image_size, network_capacity, transparent = False, batch_size = 4, mixed_prob = 0.9, gradient_accumulate_every=1, lr = 2e-4, lr_mlp = 1., ttur_mult = 2, rel_disc_loss = False, num_workers = None, save_every = 1000, trunc_psi = 0.6, fp16 = False, cl_reg = False, fq_layers = [], fq_dict_size = 256, attn_layers = [], no_const = False, aug_prob = 0., aug_types = ['translation', 'cutout'], generator_top_k = False, generator_top_k_gamma = 0.99, generator_top_k_frac = 0.5, dataset_aug_prob = 0., calculate_fid_every = None, is_ddp = False, rank = 0, world_size = 1, *args, **kwargs):
        self.GAN_params = [args, kwargs]
        self.GAN = None

@@ -747,6 +747,10 @@ def __init__(self, name, results_dir, models_dir, image_size, network_capacity,

        self.calculate_fid_every = calculate_fid_every

+        self.generator_top_k = generator_top_k
+        self.generator_top_k_gamma = generator_top_k_gamma
+        self.generator_top_k_frac = generator_top_k_frac
+
        assert not (is_ddp and cl_reg), 'Contrastive loss regularization does not work well with multi GPUs yet'
        self.is_ddp = is_ddp
        self.is_main = rank == 0
@@ -912,7 +916,17 @@ def train(self):

            generated_images = G(w_styles, noise)
            fake_output, _ = D_aug(generated_images, **aug_kwargs)
-            loss = fake_output.mean()
+            fake_output_loss = fake_output
+
+            if self.generator_top_k:
+                epochs = (self.steps * batch_size * self.gradient_accumulate_every) / len(self.dataset)
+                k_frac = max(self.generator_top_k_gamma ** epochs, self.generator_top_k_frac)
+                k = math.ceil(batch_size * k_frac)
+
+                if k != batch_size:
+                    fake_output_loss, _ = fake_output_loss.topk(k=k, largest=False)
+
+            loss = fake_output_loss.mean()
            gen_loss = loss

            if apply_path_penalty:
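
The hunk in `train()` above is small enough to replay outside the Trainer. Below is a minimal standalone sketch of the added top-k generator loss, with a random tensor standing in for the discriminator output and invented values for the batch size, epoch count, and hyperparameters; it mirrors the added lines rather than reusing the library.

```python
import math
import torch

# Stand-ins for values the Trainer would supply (all invented for illustration).
batch_size = 8
generator_top_k = True
generator_top_k_gamma = 0.99
generator_top_k_frac = 0.5
epochs = 30.0  # in train() this is derived from steps, batch size and gradient accumulation

# In the real loop this would be D_aug(generated_images)[0]; here it is random scores.
fake_output = torch.randn(batch_size)
fake_output_loss = fake_output

if generator_top_k:
    # decay the kept fraction toward the floor, then round the kept count up
    k_frac = max(generator_top_k_gamma ** epochs, generator_top_k_frac)
    k = math.ceil(batch_size * k_frac)

    if k != batch_size:
        # keep the k smallest discriminator outputs: the generator minimizes
        # fake_output.mean(), so these are the samples it already does best on,
        # and the worst (batch_size - k) samples drop out of the gradient
        fake_output_loss, _ = fake_output_loss.topk(k = k, largest = False)

loss = fake_output_loss.mean()
print(f'kept {fake_output_loss.numel()} of {batch_size} samples, generator loss = {loss.item():.4f}')
```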
