
KAFKA-9965/KAFKA-13303: RoundRobinPartitioner broken by KIP-480 #20170


Closed
Changes from all commits (19 commits)
bde18b8
test
jim0987795064 May 2, 2025
f042910
Merge branch 'apache:trunk' into trunk
jim0987795064 May 5, 2025
ab1968b
Merge branch 'apache:trunk' into trunk
jim0987795064 May 20, 2025
bc93709
Merge branch 'apache:trunk' into trunk
jim0987795064 May 20, 2025
5863a74
Merge branch 'apache:trunk' into trunk
jim0987795064 May 22, 2025
6507cd0
Merge branch 'apache:trunk' into trunk
jim0987795064 May 27, 2025
42cab51
Sync KafkaAdminClient.java from upstream/trunk
jim0987795064 May 27, 2025
4e94751
Merge branch 'trunk' of https://github.com/jim0987795064/kafka into t…
jim0987795064 May 27, 2025
d5ad01b
Merge branch 'trunk' of https://github.com/apache/kafka into trunk
jim0987795064 May 28, 2025
805d812
Merge branch 'trunk' of https://github.com/apache/kafka into trunk
jim0987795064 May 29, 2025
60d3c2f
Merge branch 'trunk' of https://github.com/apache/kafka into trunk
jim0987795064 Jun 7, 2025
3bd11a0
Merge branch 'trunk' of https://github.com/apache/kafka into trunk
jim0987795064 Jun 21, 2025
4679e2b
Merge branch 'trunk' of https://github.com/apache/kafka into trunk
jim0987795064 Jul 5, 2025
316d9a0
Merge remote-tracking branch 'upstream/trunk' into trunk
jim0987795064 Jul 11, 2025
c73a3c9
Merge branch 'trunk' of https://github.com/apache/kafka into trunk
jim0987795064 Jul 15, 2025
9d577ef
fix(producer): preserve partition order in RoundRobinPartitioner acro…
jim0987795064 Jul 15, 2025
20c7ab2
Add the logic of abortOnNewBatch to KafkaProducer and add edge case t…
jim0987795064 Aug 3, 2025
546d0cd
refactor: Remove redundant code of RoundRobinPartitionerTest
jim0987795064 Aug 3, 2025
0abcc57
Merge branch 'trunk' of https://github.com/apache/kafka into KAFKA-99…
jim0987795064 Aug 3, 2025
KafkaProducer.java
@@ -1124,12 +1124,30 @@ private Future<RecordMetadata> doSend(ProducerRecord<K, V> record, Callback call
ensureValidRecordSize(serializedSize);
long timestamp = record.timestamp() == null ? nowMs : record.timestamp();

// A custom RoundRobinPartitioner may take advantage of the onNewBatch callback.
boolean abortOnNewBatch = false;
if (partitionerPlugin.get() instanceof RoundRobinPartitioner) {
abortOnNewBatch = true;
}

// Append the record to the accumulator. Note, that the actual partition may be
// calculated there and can be accessed via appendCallbacks.topicPartition.
RecordAccumulator.RecordAppendResult result = accumulator.append(record.topic(), partition, timestamp, serializedKey,
serializedValue, headers, appendCallbacks, remainingWaitMs, nowMs, cluster);
serializedValue, headers, appendCallbacks, remainingWaitMs, nowMs, cluster, abortOnNewBatch);
assert appendCallbacks.getPartition() != RecordMetadata.UNKNOWN_PARTITION;

// Notify the RoundRobinPartitioner that the previous batch is full, and request it to return prevPartition to the idle queue.
if (result.abortOnNewBatch) {
int prevPartition = partition;
((RoundRobinPartitioner) partitionerPlugin.get()).onNewBatch(record.topic(), cluster, prevPartition);
Contributor:

Hmm, onNewBatch() is already removed in trunk and Partitioner.partition() should be called only once for each record in the producer. So, the current RoundRobinPartitioner should work in trunk, right?

Member:

That makes sense. We should fix it in the 3.9 branch only

Contributor:

For 3.9, should we just cherry pick #17620?

Member:

That is an interesting story

#17620 was not backported to 3.9, and the fix in 4.0 was removed along with the deprecated method. In other words, the fix never made it into any release ...

I will backport #17620 to 3.9

Contributor:

4.0 shouldn't have this issue since https://github.com/apache/kafka/pull/18282/files removed the extra partition() call, right?

Member:

Yes, the bug no longer exists in 4.x.

I just find it interesting that the final fix in 4.0 was simply to remove the extra partition() call

partition = partition(record, serializedKey, serializedValue, cluster);
if (log.isTraceEnabled()) {
log.trace("Retrying append due to new batch creation for topic {} partition {}. The old partition was {}", record.topic(), partition, prevPartition);
}
result = accumulator.append(record.topic(), partition, timestamp, serializedKey,
serializedValue, headers, appendCallbacks, remainingWaitMs, nowMs, cluster, false);
}

// Add the partition to the transaction (if in progress) after it has been successfully
// appended to the accumulator. We cannot do it before because the partition may be
// unknown. Note that the `Sender` will refuse to dequeue
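For context on why this abort-and-retry path exists at all: under the KIP-480 flow the producer could call the partitioner a second time whenever the first append had to open a new batch, so a purely counter-based round robin advances twice for one record and skews or skips partitions (KAFKA-9965). A minimal standalone illustration of that skew, written in plain Java rather than against the Kafka API, with the "two partitioner calls per record" pattern as a deliberate simplification:

import java.util.concurrent.atomic.AtomicInteger;

public class RoundRobinSkipDemo {
    public static void main(String[] args) {
        AtomicInteger counter = new AtomicInteger(0);
        int numPartitions = 4; // illustrative partition count

        for (int record = 0; record < 8; record++) {
            // First call: the partition the producer initially targeted, then abandoned
            // because a new batch had to be created for it.
            int abandoned = counter.getAndIncrement() % numPartitions;
            // Second call: the partition actually used for the retried append.
            int chosen = counter.getAndIncrement() % numPartitions;
            System.out.println("record " + record + ": abandoned=" + abandoned + ", chosen=" + chosen);
        }
        // With 4 partitions the chosen sequence is 1, 3, 1, 3, ... so partitions 0 and 2
        // never receive a record. The queue-based hand-back in this PR replays the
        // abandoned partition on the retry instead of advancing the counter again.
    }
}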
RoundRobinPartitioner.java
@@ -20,9 +20,14 @@
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.utils.Utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;

@@ -33,9 +38,15 @@
* to distribute the writes to all partitions equally. This
* is the behaviour regardless of record key hash.
*
* The "Round-Robin" partitioner - MODIFIED TO WORK PROPERLY WITH STICKY PARTITIONING (KIP-480)
* <p>
* This partitioning strategy can be used when user wants to distribute the writes to all
* partitions equally. This is the behaviour regardless of record key hash.
*/
public class RoundRobinPartitioner implements Partitioner {
private static final Logger LOGGER = LoggerFactory.getLogger(RoundRobinPartitioner.class);
private final ConcurrentMap<String, AtomicInteger> topicCounterMap = new ConcurrentHashMap<>();
private final ConcurrentMap<String, Queue<Integer>> topicPartitionQueueMap = new ConcurrentHashMap<>();

public void configure(Map<String, ?> configs) {}

@@ -51,15 +62,25 @@ public void configure(Map<String, ?> configs) {}
*/
@Override
public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
int nextValue = nextValue(topic);
List<PartitionInfo> availablePartitions = cluster.availablePartitionsForTopic(topic);
if (!availablePartitions.isEmpty()) {
int part = Utils.toPositive(nextValue) % availablePartitions.size();
return availablePartitions.get(part).partition();
Queue<Integer> partitionQueue = partitionQueueComputeIfAbsent(topic);
Integer queuedPartition = partitionQueue.poll();
if (queuedPartition != null) {
LOGGER.trace("Partition chosen from queue: {}", queuedPartition);
return queuedPartition;
} else {
// no partitions are available, give a non-available partition
int numPartitions = cluster.partitionsForTopic(topic).size();
return Utils.toPositive(nextValue) % numPartitions;
List<PartitionInfo> partitions = cluster.partitionsForTopic(topic);
int numPartitions = partitions.size();
int nextValue = nextValue(topic);
List<PartitionInfo> availablePartitions = cluster.availablePartitionsForTopic(topic);
if (!availablePartitions.isEmpty()) {
int part = Utils.toPositive(nextValue) % availablePartitions.size();
int partition = availablePartitions.get(part).partition();
LOGGER.trace("Partition chosen: {}", partition);
return partition;
} else {
// no partitions are available, give a non-available partition
return Utils.toPositive(nextValue) % numPartitions;
}
}
}

@@ -68,5 +89,25 @@ private int nextValue(String topic) {
return counter.getAndIncrement();
}

private Queue<Integer> partitionQueueComputeIfAbsent(String topic) {
return topicPartitionQueueMap.computeIfAbsent(topic, k -> new ConcurrentLinkedQueue<>());
}

public void close() {}

/**
* Notifies the partitioner that a new batch is about to be created.
* When using the RoundRobinPartitioner,
* this method helps preserve partition order across batches in multi-threaded scenarios.
*
* @param topic The topic name
* @param cluster The current cluster metadata
* @param prevPartition The partition previously selected for the record that triggered a new
* batch
*/
public void onNewBatch(String topic, Cluster cluster, int prevPartition) {
LOGGER.trace("New batch so enqueuing partition {} for topic {}", prevPartition, topic);
Queue<Integer> partitionQueue = partitionQueueComputeIfAbsent(topic);
partitionQueue.add(prevPartition);
}
}
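A small usage sketch of the modified partitioner in isolation. This is a hypothetical harness, not part of the diff; it assumes the queue-backed onNewBatch added above and builds the cluster metadata directly, the same way the unit tests below do:

import org.apache.kafka.clients.producer.RoundRobinPartitioner;
import org.apache.kafka.common.Cluster;
import org.apache.kafka.common.Node;
import org.apache.kafka.common.PartitionInfo;

import java.util.Collections;
import java.util.List;

public class RoundRobinQueueSketch {
    public static void main(String[] args) {
        Node node = new Node(0, "localhost", 9092);
        Node[] nodes = new Node[] {node};
        List<PartitionInfo> partitions = List.of(
                new PartitionInfo("demo", 0, node, nodes, nodes),
                new PartitionInfo("demo", 1, node, nodes, nodes),
                new PartitionInfo("demo", 2, node, nodes, nodes));
        Cluster cluster = new Cluster("clusterId", List.of(node), partitions,
                Collections.emptySet(), Collections.emptySet());

        RoundRobinPartitioner partitioner = new RoundRobinPartitioner();

        // Counter-driven selection while the queue is empty: expected 0, then 1.
        int first = partitioner.partition("demo", null, null, null, null, cluster);
        int second = partitioner.partition("demo", null, null, null, null, cluster);

        // The producer aborted the append for `second` to open a new batch,
        // so it hands the partition back instead of letting it be skipped.
        partitioner.onNewBatch("demo", cluster, second);

        // The retried append drains the queue and gets the same partition again,
        // then the counter resumes: expected 1, then 2.
        int retried = partitioner.partition("demo", null, null, null, null, cluster);
        int next = partitioner.partition("demo", null, null, null, null, cluster);

        System.out.printf("first=%d second=%d retried=%d next=%d%n", first, second, retried, next);
        partitioner.close();
    }
}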
RecordAccumulator.java
@@ -281,7 +281,8 @@ public RecordAppendResult append(String topic,
AppendCallbacks callbacks,
long maxTimeToBlock,
long nowMs,
Cluster cluster) throws InterruptedException {
Cluster cluster,
boolean abortOnNewBatch) throws InterruptedException {
TopicInfo topicInfo = topicInfoMap.computeIfAbsent(topic, k -> new TopicInfo(createBuiltInPartitioner(logContext, k, batchSize)));

// We keep track of the number of appending thread to make sure we do not miss batches in
@@ -325,6 +326,12 @@ public RecordAppendResult append(String topic,
}
}

// we don't have an in-progress record batch; try to allocate a new batch
if (abortOnNewBatch) {
// Return a result that will cause another call to append.
return new RecordAppendResult(null, false, false, 0, true);
}

if (buffer == null) {
int size = Math.max(this.batchSize, AbstractRecords.estimateSizeInBytesUpperBound(
RecordBatch.CURRENT_MAGIC_VALUE, compression.type(), key, value, headers));
@@ -398,7 +405,7 @@ private RecordAppendResult appendNewBatch(String topic,
dq.addLast(batch);
incomplete.add(batch);

return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true, batch.estimatedSizeInBytes());
return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true, batch.estimatedSizeInBytes(), false);
}

private MemoryRecordsBuilder recordsBuilder(ByteBuffer buffer) {
@@ -434,7 +441,7 @@ private RecordAppendResult tryAppend(long timestamp, byte[] key, byte[] value, H
last.closeForRecordAppends();
} else {
int appendedBytes = last.estimatedSizeInBytes() - initialBytes;
return new RecordAppendResult(future, deque.size() > 1 || last.isFull(), false, appendedBytes);
return new RecordAppendResult(future, deque.size() > 1 || last.isFull(), false, appendedBytes, false);
}
}
return null;
@@ -1197,15 +1204,19 @@ public static final class RecordAppendResult {
public final boolean batchIsFull;
public final boolean newBatchCreated;
public final int appendedBytes;
public final boolean abortOnNewBatch;


public RecordAppendResult(FutureRecordMetadata future,
boolean batchIsFull,
boolean newBatchCreated,
int appendedBytes) {
int appendedBytes,
boolean abortOnNewBatch) {
this.future = future;
this.batchIsFull = batchIsFull;
this.newBatchCreated = newBatchCreated;
this.appendedBytes = appendedBytes;
this.abortOnNewBatch = abortOnNewBatch;
}
}

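Taken together, the accumulator changes define a small contract: when the caller passes abortOnNewBatch=true and the record cannot join an already open batch, append() returns early with a null future and abortOnNewBatch set, and the caller retries exactly once with abortOnNewBatch=false. A toy model of that contract in plain Java (not Kafka code; the batch bookkeeping is deliberately simplified):

import java.util.ArrayList;
import java.util.List;

public class AbortOnNewBatchContract {

    record AppendResult(boolean appended, boolean abortOnNewBatch) {}

    private static final int BATCH_SIZE = 2;
    private List<String> openBatch; // null means no in-progress batch

    AppendResult append(String value, boolean abortOnNewBatch) {
        if (openBatch != null && openBatch.size() < BATCH_SIZE) {
            openBatch.add(value);                 // joined the open batch
            return new AppendResult(true, false);
        }
        if (abortOnNewBatch) {
            return new AppendResult(false, true); // nothing buffered; caller must retry
        }
        openBatch = new ArrayList<>();            // allocate a new batch
        openBatch.add(value);
        return new AppendResult(true, false);
    }

    public static void main(String[] args) {
        AbortOnNewBatchContract accumulator = new AbortOnNewBatchContract();
        AppendResult result = accumulator.append("r1", true); // no open batch yet, so aborted
        if (result.abortOnNewBatch()) {
            // In the producer this is where onNewBatch() and the second partition() call happen.
            result = accumulator.append("r1", false);
        }
        System.out.println("appended=" + result.appended());
    }
}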
KafkaProducerTest.java
@@ -1512,7 +1512,8 @@ public void testSendNotAllowedInPreparedTransactionState() throws Exception {
any(),
anyLong(),
anyLong(),
any()
any(),
eq(false)
);
}
}
@@ -2770,7 +2771,8 @@ private <T> FutureRecordMetadata expectAppend(
any(RecordAccumulator.AppendCallbacks.class), // 6 <--
anyLong(),
anyLong(),
any()
any(),
eq(false)
)).thenAnswer(invocation -> {
RecordAccumulator.AppendCallbacks callbacks =
(RecordAccumulator.AppendCallbacks) invocation.getArguments()[6];
@@ -2779,7 +2781,8 @@
futureRecordMetadata,
false,
false,
0);
0,
false);
});

return futureRecordMetadata;
RoundRobinPartitionerTest.java
@@ -43,9 +43,9 @@ public void testRoundRobinWithUnavailablePartitions() {
// Intentionally make the partition list not in partition order to test the edge
// cases.
List<PartitionInfo> partitions = asList(
new PartitionInfo("test", 1, null, NODES, NODES),
new PartitionInfo("test", 2, NODES[1], NODES, NODES),
new PartitionInfo("test", 0, NODES[0], NODES, NODES));
new PartitionInfo("test", 1, null, NODES, NODES),
new PartitionInfo("test", 2, NODES[1], NODES, NODES),
new PartitionInfo("test", 0, NODES[0], NODES, NODES));
// When there are some unavailable partitions, we want to make sure that (1) we
// always pick an available partition,
// and (2) the available partitions are selected in a round robin way.
@@ -70,11 +70,13 @@ public void testRoundRobinWithKeyBytes() {
final String topicA = "topicA";
final String topicB = "topicB";

List<PartitionInfo> allPartitions = asList(new PartitionInfo(topicA, 0, NODES[0], NODES, NODES),
new PartitionInfo(topicA, 1, NODES[1], NODES, NODES), new PartitionInfo(topicA, 2, NODES[2], NODES, NODES),
new PartitionInfo(topicB, 0, NODES[0], NODES, NODES));
List<PartitionInfo> allPartitions = asList(
new PartitionInfo(topicA, 0, NODES[0], NODES, NODES),
new PartitionInfo(topicA, 1, NODES[1], NODES, NODES),
new PartitionInfo(topicA, 2, NODES[2], NODES, NODES),
new PartitionInfo(topicB, 0, NODES[0], NODES, NODES));
Cluster testCluster = new Cluster("clusterId", asList(NODES[0], NODES[1], NODES[2]), allPartitions,
Collections.emptySet(), Collections.emptySet());
Collections.emptySet(), Collections.emptySet());

final Map<Integer, Integer> partitionCount = new HashMap<>();

@@ -96,4 +98,45 @@
assertEquals(10, partitionCount.get(1).intValue());
assertEquals(10, partitionCount.get(2).intValue());
}

@Test
public void testRoundRobinWithAbortOnNewBatch() throws Exception {
final String topicA = "topicA";
final String topicB = "topicB";

List<PartitionInfo> allPartitions = asList(
new PartitionInfo(topicA, 0, NODES[0], NODES, NODES),
new PartitionInfo(topicA, 1, NODES[0], NODES, NODES),
new PartitionInfo(topicA, 2, NODES[0], NODES, NODES),
new PartitionInfo(topicA, 3, NODES[0], NODES, NODES),
new PartitionInfo(topicA, 4, NODES[0], NODES, NODES),
new PartitionInfo(topicB, 0, NODES[1], NODES, NODES),
new PartitionInfo(topicB, 1, NODES[1], NODES, NODES));

Cluster testCluster = new Cluster("clusterId", asList(NODES[0], NODES[1]), allPartitions,
Collections.<String>emptySet(), Collections.<String>emptySet());

RoundRobinPartitioner partitioner = new RoundRobinPartitioner();

// Verify that partition selection still works correctly when queue is empty.
assertEquals(0, partitioner.partition(topicA, null, null, null, null, testCluster));
assertEquals(1, partitioner.partition(topicA, null, null, null, null, testCluster));
assertEquals(0, partitioner.partition(topicB, null, null, null, null, testCluster));

// Abort for new batch - previous partition should be returned on subsequent call
// Simulate three threads producing to two topics, with race condition in producer
partitioner.onNewBatch(topicA, testCluster, 0);
partitioner.onNewBatch(topicA, testCluster, 1);
partitioner.onNewBatch(topicB, testCluster, 0);
assertEquals(0, partitioner.partition(topicA, null, null, null, null, testCluster));
assertEquals(1, partitioner.partition(topicA, null, null, null, null, testCluster));
assertEquals(0, partitioner.partition(topicB, null, null, null, null, testCluster));

// Verify that partition selection still works correctly after call to onNewBatch.
assertEquals(2, partitioner.partition(topicA, null, null, null, null, testCluster));
assertEquals(3, partitioner.partition(topicA, null, null, null, null, testCluster));
assertEquals(4, partitioner.partition(topicA, null, null, null, null, testCluster));
assertEquals(1, partitioner.partition(topicB, null, null, null, null, testCluster));

Reviewer:

It's great that this test verifies that the next partition selected matches the enqueued value for each topic. Consider adding test cases for some edge cases, like an empty queue and error handling.

Contributor Author:

Hello @ash-at-github,
Thanks for this suggestion. I've added a test case for the empty queue scenario.
Let me know if you have more questions.

assertEquals(0, partitioner.partition(topicB, null, null, null, null, testCluster));
}
}