Commit 5f00ce8

Configure batchesPerIteration relative to the nodeCount

Using a ratio instead of an absolute number.

Co-authored-by: Jacob Sznajdman <breakanalysis@gmail.com>

1 parent 33220ee

File tree

5 files changed: +43 -32 lines changed

algo/src/main/java/org/neo4j/gds/embeddings/graphsage/GraphSageModelTrainer.java
algo/src/main/java/org/neo4j/gds/embeddings/graphsage/algo/GraphSageTrainConfig.java
algo/src/test/java/org/neo4j/gds/embeddings/graphsage/GraphSageModelTrainerTest.java
algo/src/test/java/org/neo4j/gds/embeddings/graphsage/algo/GraphSageConfigTest.java
doc/asciidoc/machine-learning/node-embeddings/graph-sage/specific-train-configuration.adoc

algo/src/main/java/org/neo4j/gds/embeddings/graphsage/GraphSageModelTrainer.java

Lines changed: 20 additions & 19 deletions

@@ -130,21 +130,23 @@ public ModelTrainResult train(Graph graph, HugeObjectArray<double[]> features) {
             config.batchSize(),
             batch -> createBatchTask(graph, features, layers, weights, batch)
         );
+        var random = new Random(randomSeed);
+        Supplier<List<BatchTask>> batchTaskSampler = () -> IntStream.range(0, config.batchesPerIteration(graph.nodeCount()))
+            .mapToObj(__ -> batchTasks.get(random.nextInt(batchTasks.size())))
+            .collect(Collectors.toList());

         progressTracker.endSubTask("Prepare batches");

+        progressTracker.beginSubTask("Train model");
+
         boolean converged = false;
         var iterationLossesPerEpoch = new ArrayList<List<Double>>();
-
         var prevEpochLoss = Double.NaN;
-        var random = new Random(randomSeed);
-
-        progressTracker.beginSubTask("Train model");
-
         int epochs = config.epochs();
+
         for (int epoch = 1; epoch <= epochs && !converged; epoch++) {
             progressTracker.beginSubTask("Epoch");
-            var epochResult = trainEpoch(() -> batchTasks.get(random.nextInt(batchTasks.size())), weights, prevEpochLoss);
+            var epochResult = trainEpoch(batchTaskSampler, weights, prevEpochLoss);
             List<Double> epochLosses = epochResult.losses();
             iterationLossesPerEpoch.add(epochLosses);
             prevEpochLoss = epochLosses.get(epochLosses.size() - 1);

@@ -188,7 +190,11 @@ private BatchTask createBatchTask(
         return new BatchTask(lossFunction, weights, progressTracker);
     }

-    private EpochResult trainEpoch(Supplier<BatchTask> batchTaskSupplier, List<Weights<? extends Tensor<?>>> weights, double prevEpochLoss) {
+    private EpochResult trainEpoch(
+        Supplier<List<BatchTask>> sampledBatchTaskSupplier,
+        List<Weights<? extends Tensor<?>>> weights,
+        double prevEpochLoss
+    ) {
         var updater = new AdamOptimizer(weights, config.learningRate());

         int iteration = 1;

@@ -200,14 +206,11 @@ private EpochResult trainEpoch(Supplier<BatchTask> batchTaskSupplier, List<Weigh
         for (; iteration <= maxIterations; iteration++) {
             progressTracker.beginSubTask("Iteration");

-            var batchTasks = IntStream
-                .range(0, config.batchesPerIteration())
-                .mapToObj(__ -> batchTaskSupplier.get())
-                .collect(Collectors.toList());
+            var sampledBatchTasks = sampledBatchTaskSupplier.get();

             // run forward + maybe backward for each Batch
-            ParallelUtil.runWithConcurrency(config.concurrency(), batchTasks, executor);
-            var avgLoss = batchTasks.stream().mapToDouble(BatchTask::loss).average().orElseThrow();
+            ParallelUtil.runWithConcurrency(config.concurrency(), sampledBatchTasks, executor);
+            var avgLoss = sampledBatchTasks.stream().mapToDouble(BatchTask::loss).average().orElseThrow();
             iterationLosses.add(avgLoss);
             progressTracker.logMessage(formatWithLocale("LOSS: %.10f", avgLoss));

@@ -219,7 +222,7 @@ private EpochResult trainEpoch(Supplier<BatchTask> batchTaskSupplier, List<Weigh

             prevLoss = avgLoss;

-            var batchedGradients = batchTasks
+            var batchedGradients = sampledBatchTasks
                 .stream()
                 .map(BatchTask::weightGradients)
                 .collect(Collectors.toList());

@@ -246,7 +249,7 @@ static class BatchTask implements Runnable {
         private final List<Weights<? extends Tensor<?>>> weightVariables;
         private List<? extends Tensor<?>> weightGradients;
         private final ProgressTracker progressTracker;
-        private double prevLoss;
+        private double loss;

         BatchTask(
             Variable<Scalar> lossFunction,

@@ -261,9 +264,7 @@ static class BatchTask implements Runnable {
         @Override
         public void run() {
             var localCtx = new ComputationContext();
-            var loss = localCtx.forward(lossFunction).value();
-
-            prevLoss = loss;
+            loss = localCtx.forward(lossFunction).value();

             localCtx.backward(lossFunction);
             weightGradients = weightVariables.stream().map(localCtx::gradient).collect(Collectors.toList());

@@ -272,7 +273,7 @@ public void run() {
         }

         public double loss() {
-            return prevLoss;
+            return loss;
         }

         List<? extends Tensor<?>> weightGradients() {
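For context, the sampler introduced above draws batchesPerIteration tasks uniformly at random, with replacement, from the precomputed batch tasks, so a batch may appear more than once within a single iteration. Below is a minimal self-contained sketch of that sampling pattern; plain strings stand in for the GDS BatchTask objects, and the class name and all values are hypothetical.

import java.util.List;
import java.util.Random;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class BatchSamplerSketch {
    public static void main(String[] args) {
        // Stand-ins for the precomputed per-batch training tasks.
        List<String> batchTasks = List.of("batch-0", "batch-1", "batch-2", "batch-3");
        int batchesPerIteration = 2;

        // A seeded RNG makes the sampling reproducible, as with randomSeed in the trainer.
        var random = new Random(42L);

        // Uniform sampling WITH replacement: the same batch may be drawn twice in one
        // iteration, and not every batch is necessarily seen in every iteration.
        Supplier<List<String>> batchTaskSampler = () -> IntStream
            .range(0, batchesPerIteration)
            .mapToObj(__ -> batchTasks.get(random.nextInt(batchTasks.size())))
            .collect(Collectors.toList());

        // Each training iteration asks the sampler for a fresh selection.
        System.out.println(batchTaskSampler.get());
        System.out.println(batchTaskSampler.get());
    }
}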

algo/src/main/java/org/neo4j/gds/embeddings/graphsage/algo/GraphSageTrainConfig.java

Lines changed: 7 additions & 4 deletions

@@ -120,13 +120,16 @@ default int maxIterations() {
         return 10;
     }

-    @Configuration.Key("batchesPerIteration")
-    Optional<Integer> maybeBatchesPerIteration();
+    @Configuration.Key("batchSamplingRatio")
+    @Configuration.DoubleRange(min = 0, max = 1, minInclusive = false)
+    Optional<Double> maybeBatchSamplingRatio();

     @Configuration.Ignore
     @Value.Derived
-    default int batchesPerIteration() {
-        return maybeBatchesPerIteration().orElse(concurrency());
+    default int batchesPerIteration(long nodeCount) {
+        var samplingRatio = maybeBatchSamplingRatio().orElse(Math.min(1.0, batchSize() * concurrency() / (double) nodeCount));
+        var totalNumberOfBatches = Math.ceil(nodeCount / (double) batchSize());
+        return (int) Math.ceil(samplingRatio * totalNumberOfBatches);
     }

     @Value.Default
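To make the derived value concrete, here is a minimal sketch of the same computation with hypothetical values (batchSize = 100, concurrency = 4, nodeCount = 10000), showing that the default ratio roughly recovers the previous default of concurrency batches per iteration:

public class BatchesPerIterationSketch {
    static int batchesPerIteration(double samplingRatio, long nodeCount, int batchSize) {
        var totalNumberOfBatches = Math.ceil(nodeCount / (double) batchSize);
        return (int) Math.ceil(samplingRatio * totalNumberOfBatches);
    }

    public static void main(String[] args) {
        int batchSize = 100;
        int concurrency = 4;
        long nodeCount = 10_000;

        // Default ratio: min(1, batchSize * concurrency / nodeCount) = min(1, 0.04) = 0.04.
        double defaultRatio = Math.min(1.0, batchSize * concurrency / (double) nodeCount);

        // totalNumberOfBatches = ceil(10000 / 100) = 100, so
        // ceil(0.04 * 100) = 4 == concurrency: roughly one batch per thread, as before.
        System.out.println(batchesPerIteration(defaultRatio, nodeCount, batchSize)); // 4

        // An explicit ratio of 0.2 samples 20% of the 100 batches each iteration.
        System.out.println(batchesPerIteration(0.2, nodeCount, batchSize)); // 20
    }
}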

algo/src/test/java/org/neo4j/gds/embeddings/graphsage/GraphSageModelTrainerTest.java

Lines changed: 4 additions & 4 deletions

@@ -310,13 +310,13 @@ void testConvergence() {

     @ParameterizedTest
     @CsvSource({
-        "1, true, 8",
-        "5, false, 10"
+        "0.01, true, 8",
+        "1.0, false, 10"
     })
-    void batchesPerIteration(int batchesPerIteration, boolean expectedConvergence, int expectedRanEpochs) {
+    void batchesPerIteration(double batchSamplingRatio, boolean expectedConvergence, int expectedRanEpochs) {
         var trainer = new GraphSageModelTrainer(
             configBuilder.modelName("convergingModel:)")
-                .maybeBatchesPerIteration(batchesPerIteration)
+                .maybeBatchSamplingRatio(batchSamplingRatio)
                 .embeddingDimension(12)
                 .aggregator(AggregatorType.POOL)
                 .epochs(10)
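Note that because the derived batchesPerIteration is rounded up with Math.ceil, even the very small ratio 0.01 in the first test case still samples at least one batch per iteration.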

algo/src/test/java/org/neo4j/gds/embeddings/graphsage/algo/GraphSageConfigTest.java

Lines changed: 11 additions & 4 deletions

@@ -22,6 +22,7 @@
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.CsvSource;
 import org.junit.jupiter.params.provider.MethodSource;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.neo4j.gds.core.CypherMapWrapper;

@@ -51,15 +52,21 @@ private static Stream<Arguments> invalidAggregator() {
         );
     }

-    @Test
-    void specifyBatchesPerIteration() {
+    @ParameterizedTest
+    @CsvSource({
+        "0.5, 100, 1",
+        "0.2, 1000, 2",
+        "0.99, 1000, 10",
+    })
+    void specifyBatchesPerIteration(double samplingRatio, long nodeCount, int expectedSampledBatches) {
         var mapWrapper = CypherMapWrapper.create(Map.of(
             "modelName", "foo",
             "featureProperties", List.of("a"),
-            "batchesPerIteration", 42
+            "batchSamplingRatio", samplingRatio,
+            "batchSize", 100
         ));

-        assertThat(GraphSageTrainConfig.of("user", mapWrapper).batchesPerIteration()).isEqualTo(42);
+        assertThat(GraphSageTrainConfig.of("user", mapWrapper).batchesPerIteration(nodeCount)).isEqualTo(expectedSampledBatches);
     }

     @Test
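These expected values follow directly from the derivation in GraphSageTrainConfig: with batchSize = 100, a 100-node graph has ceil(100 / 100) = 1 batch, so ceil(0.5 * 1) = 1; a 1000-node graph has ceil(1000 / 100) = 10 batches, giving ceil(0.2 * 10) = 2 and ceil(0.99 * 10) = 10.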

doc/asciidoc/machine-learning/node-embeddings/graph-sage/specific-train-configuration.adoc

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 | learningRate | Float | 0.1 | yes | The learning rate determines the step size at each iteration while moving toward a minimum of a loss function.
 | epochs | Integer | 1 | yes | Number of times to traverse the graph.
 | <<common-configuration-max-iterations,maxIterations>> | Integer | 10 | yes | Maximum number of iterations per epoch. Each iteration the weights are updated.
-| <<common-configuration-max-iterations,batchesPerIteration>> | Integer | `concurrency` | yes | Number of batches to consider per weight updates.
+| batchSamplingRatio | Float | `concurrency * batchSize / nodeCount` | yes | Sampling ratio of batches to consider per weight update. By default, each thread evaluates a single batch. The gradients per batch are averaged to update the weights.
 | searchDepth | Integer | 5 | yes | Maximum depth of the RandomWalks to sample nearby nodes for the training.
 | negativeSampleWeight | Integer | 20 | yes | The weight of the negative samples. Higher values increase the impact of negative samples in the loss.
 | <<common-configuration-relationship-weight-property,relationshipWeightProperty>> | String | null | yes | Name of the relationship property to use as weights. If unspecified, the algorithm runs unweighted.
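Since the parameter is now a ratio, the old semantics of an absolute count of k batches per iteration can still be approximated by passing batchSamplingRatio = k / ceil(nodeCount / batchSize), provided the resulting value lies in the valid range (greater than 0 and at most 1).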
