Fix realtime ingestion when an entire batch of messages is filtered out #7927

Merged 7 commits on Dec 21, 2021
Changes from all commits
@@ -33,7 +33,6 @@
import java.util.Set;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import javax.annotation.Nullable;
@@ -85,7 +84,6 @@
import org.apache.pinot.spi.stream.StreamMetadataProvider;
import org.apache.pinot.spi.stream.StreamPartitionMsgOffset;
import org.apache.pinot.spi.stream.StreamPartitionMsgOffsetFactory;
import org.apache.pinot.spi.stream.TransientConsumerException;
import org.apache.pinot.spi.utils.CommonConstants.ConsumerState;
import org.apache.pinot.spi.utils.CommonConstants.Segment.Realtime.CompletionMode;
import org.apache.pinot.spi.utils.IngestionConfigUtils;
@@ -401,16 +399,12 @@ protected boolean consumeLoop()
.fetchMessages(_currentOffset, null, _partitionLevelStreamConfig.getFetchTimeoutMillis());
_endOfPartitionGroup = messageBatch.isEndOfPartitionGroup();
_consecutiveErrorCount = 0;
} catch (TimeoutException e) {
handleTransientStreamErrors(e);
continue;
} catch (TransientConsumerException e) {
handleTransientStreamErrors(e);
continue;
} catch (PermanentConsumerException e) {
_segmentLogger.warn("Permanent exception from stream when fetching messages, stopping consumption", e);
throw e;
} catch (Exception e) {
// All exceptions other than PermanentConsumerException are handled the same way;
// routinely this can be a TimeoutException or a TransientConsumerException
Member Author: Collapsed identical catch blocks for the sake of the person reading the code.

// Unknown exception from stream. Treat as a transient exception.
// One such exception seen so far is java.net.SocketTimeoutException
handleTransientStreamErrors(e);
@@ -423,12 +417,17 @@
consecutiveIdleCount = 0;
// We consumed something. Update the highest stream offset as well as partition-consuming metric.
// TODO Issue 5359 Need to find a way to bump metrics without getting actual offset value.
// _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.HIGHEST_KAFKA_OFFSET_CONSUMED,
// _currentOffset.getOffset());
// _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.HIGHEST_STREAM_OFFSET_CONSUMED,
// _currentOffset.getOffset());
//_serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.HIGHEST_KAFKA_OFFSET_CONSUMED,
//_currentOffset.getOffset());
//_serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.HIGHEST_STREAM_OFFSET_CONSUMED,
//_currentOffset.getOffset());
_serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LLC_PARTITION_CONSUMING, 1);
lastUpdatedOffset = _streamPartitionMsgOffsetFactory.create(_currentOffset);
} else if (messageBatch.getUnfilteredMessageCount() > 0) {
// we consumed something from the stream but filtered all the content out,
// so we need to advance the offsets to avoid getting stuck
_currentOffset = messageBatch.getOffsetOfNextBatch();
lastUpdatedOffset = _streamPartitionMsgOffsetFactory.create(_currentOffset);
} else {
// We did not consume any rows. Update the partition-consuming metric only if we have been idling for a long
// time.
@@ -559,7 +558,7 @@ private void processStreamEvents(MessageBatch messagesAndOffsets, long idlePipeS
if (streamMessageCount != 0) {
_segmentLogger.debug("Indexed {} messages ({} messages read from stream) current offset {}", indexedMessageCount,
streamMessageCount, _currentOffset);
} else {
} else if (messagesAndOffsets.getUnfilteredMessageCount() == 0) {
Member Author: Prevents unnecessary latency when there has been a bad batch; there is probably data waiting to be consumed.

// If there were no messages to be fetched from stream, wait for a little bit as to avoid hammering the stream
Uninterruptibles.sleepUninterruptibly(idlePipeSleepTimeMillis, TimeUnit.MILLISECONDS);
}
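To make the shape of the fix easier to follow, here is a condensed, hypothetical sketch of the decision consumeLoop now makes after each fetch. The Batch interface below is a stand-in for Pinot's MessageBatch, reduced to the three methods the fix relies on; names and the sleep interval are simplified and this is not the real implementation.

import java.util.concurrent.TimeUnit;

// Stand-in for org.apache.pinot.spi.stream.MessageBatch, trimmed to what the fix needs.
interface Batch {
  int getMessageCount();            // messages that survived filtering
  int getUnfilteredMessageCount();  // messages actually returned by the stream
  long getOffsetOfNextBatch();      // offset the next fetch should start from
}

public class ConsumeLoopSketch {
  private long _currentOffset;

  // Returns true if the offset advanced; the real code updates it message-by-message while indexing.
  boolean handleBatch(Batch batch) throws InterruptedException {
    if (batch.getMessageCount() > 0) {
      // Normal case: rows were indexed, so record where the next fetch should start.
      _currentOffset = batch.getOffsetOfNextBatch();
      return true;
    } else if (batch.getUnfilteredMessageCount() > 0) {
      // The stream returned records but every one was filtered out (e.g. tombstones).
      // Before this PR the offset stayed put and the same records were fetched forever;
      // advancing it here is what breaks the livelock.
      _currentOffset = batch.getOffsetOfNextBatch();
      return true;
    } else {
      // Truly idle: nothing came back from the stream, so back off briefly before polling again.
      TimeUnit.MILLISECONDS.sleep(100);
      return false;
    }
  }
}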
@@ -473,7 +473,11 @@ protected void pushAvroIntoKafka(List<File> avroFiles)

ClusterIntegrationTestUtils
.pushAvroIntoKafka(avroFiles, "localhost:" + getKafkaPort(), getKafkaTopic(), getMaxNumKafkaMessagesPerBatch(),
getKafkaMessageHeader(), getPartitionColumn());
getKafkaMessageHeader(), getPartitionColumn(), injectTombstones());
}

protected boolean injectTombstones() {
return false;
}

protected List<File> getAllAvroFiles()
@@ -317,7 +317,8 @@ public static void buildSegmentFromAvro(File avroFile, TableConfig tableConfig,
* @throws Exception
*/
public static void pushAvroIntoKafka(List<File> avroFiles, String kafkaBroker, String kafkaTopic,
int maxNumKafkaMessagesPerBatch, @Nullable byte[] header, @Nullable String partitionColumn)
int maxNumKafkaMessagesPerBatch, @Nullable byte[] header, @Nullable String partitionColumn,
boolean injectTombstones)
throws Exception {
Properties properties = new Properties();
properties.put("metadata.broker.list", kafkaBroker);
@@ -329,6 +330,13 @@ public static void pushAvroIntoKafka(List<File> avroFiles, String kafkaBroker, S
StreamDataProvider.getStreamDataProducer(KafkaStarterUtils.KAFKA_PRODUCER_CLASS_NAME, properties);

try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream(65536)) {
if (injectTombstones) {
// publish lots of tombstones to livelock the consumer if it can't handle this properly
for (int i = 0; i < 1000; i++) {
// a tombstone is a record with a key but a null value
producer.produce(kafkaTopic, Longs.toByteArray(System.currentTimeMillis()), null);
}
}
for (File avroFile : avroFiles) {
try (DataFileStream<GenericRecord> reader = AvroUtils.getAvroReader(avroFile)) {
BinaryEncoder binaryEncoder = new EncoderFactory().directBinaryEncoder(outputStream, null);
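For context, a "tombstone" here is simply a Kafka record that has a key but a null value. The test utility above publishes them through Pinot's StreamDataProducer; a rough equivalent using the plain Kafka producer API would look like the sketch below, where the broker address and topic name are placeholders.

import java.util.Properties;
import com.google.common.primitives.Longs;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.ByteArraySerializer;

public class TombstoneProducerSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:9092");  // placeholder broker address
    props.put("key.serializer", ByteArraySerializer.class.getName());
    props.put("value.serializer", ByteArraySerializer.class.getName());

    try (KafkaProducer<byte[], byte[]> producer = new KafkaProducer<>(props)) {
      for (int i = 0; i < 1000; i++) {
        // Key present, value null: Kafka stores the record, but there is nothing to decode,
        // so the Pinot consumer filters it out of the MessageBatch.
        producer.send(new ProducerRecord<>("myTopic", Longs.toByteArray(System.currentTimeMillis()), null));
      }
    }
  }
}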
@@ -63,6 +63,11 @@ public class LLCRealtimeClusterIntegrationTest extends RealtimeClusterIntegratio
private final boolean _enableLeadControllerResource = RANDOM.nextBoolean();
private final long _startTime = System.currentTimeMillis();

@Override
protected boolean injectTombstones() {
return true;
Member Author: Set this to false to make the test pass at eb6800da96a44e6f5125097cc99a368c2f8f8847.

}

@Override
protected boolean useLlc() {
return true;
@@ -18,10 +18,7 @@
*/
package org.apache.pinot.plugin.stream.kafka20;

import java.util.ArrayList;
import java.util.List;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.utils.Bytes;
import org.apache.pinot.plugin.stream.kafka.MessageAndOffset;
import org.apache.pinot.spi.stream.LongMsgOffset;
import org.apache.pinot.spi.stream.MessageBatch;
@@ -30,19 +27,31 @@

public class KafkaMessageBatch implements MessageBatch<byte[]> {

private List<MessageAndOffset> _messageList = new ArrayList<>();
private final List<MessageAndOffset> _messageList;
private final int _unfilteredMessageCount;
private final long _lastOffset;

public KafkaMessageBatch(Iterable<ConsumerRecord<String, Bytes>> iterable) {
for (ConsumerRecord<String, Bytes> record : iterable) {
_messageList.add(new MessageAndOffset(record.value().get(), record.offset()));
}
/**
* @param unfilteredMessageCount how many messages were received from the topic before being filtered
* @param lastOffset the offset of the last message in the batch
* @param batch the messages that survived filtering; may contain fewer entries than {@code unfilteredMessageCount}
*/
public KafkaMessageBatch(int unfilteredMessageCount, long lastOffset, List<MessageAndOffset> batch) {
_messageList = batch;
_lastOffset = lastOffset;
_unfilteredMessageCount = unfilteredMessageCount;
}

@Override
public int getMessageCount() {
return _messageList.size();
}

@Override
public int getUnfilteredMessageCount() {
return _unfilteredMessageCount;
}

@Override
public byte[] getMessageAtIndex(int index) {
return _messageList.get(index).getMessage().array();
@@ -67,4 +76,9 @@ public long getNextStreamMessageOffsetAtIndex(int index) {
public StreamPartitionMsgOffset getNextStreamParitionMsgOffsetAtIndex(int index) {
return new LongMsgOffset(_messageList.get(index).getNextOffset());
}

@Override
public StreamPartitionMsgOffset getOffsetOfNextBatch() {
return new LongMsgOffset(_lastOffset + 1);
}
}
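A short illustrative snippet (not part of the PR, offsets made up) showing what the new accounting looks like when an entire poll is filtered out; it assumes the kafka-2.0 plugin and pinot-spi modules are on the classpath.

import java.util.Collections;
import org.apache.pinot.plugin.stream.kafka.MessageAndOffset;
import org.apache.pinot.plugin.stream.kafka20.KafkaMessageBatch;
import org.apache.pinot.spi.stream.LongMsgOffset;
import org.apache.pinot.spi.stream.StreamPartitionMsgOffset;

public class KafkaMessageBatchExample {
  public static void main(String[] args) {
    // Suppose a poll returned 5 records at offsets 100..104 and every one was a tombstone.
    KafkaMessageBatch batch = new KafkaMessageBatch(5, 104L, Collections.<MessageAndOffset>emptyList());

    System.out.println(batch.getMessageCount());            // 0 -> nothing to index
    System.out.println(batch.getUnfilteredMessageCount());  // 5 -> but the stream did move
    StreamPartitionMsgOffset next = batch.getOffsetOfNextBatch();
    System.out.println(next.compareTo(new LongMsgOffset(105)) == 0);  // true: resume at 105, not 100
  }
}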
@@ -18,61 +18,51 @@
*/
package org.apache.pinot.plugin.stream.kafka20;

import com.google.common.collect.Iterables;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeoutException;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.common.utils.Bytes;
import org.apache.pinot.plugin.stream.kafka.MessageAndOffset;
import org.apache.pinot.spi.stream.LongMsgOffset;
import org.apache.pinot.spi.stream.MessageBatch;
import org.apache.pinot.spi.stream.PartitionLevelConsumer;
import org.apache.pinot.spi.stream.StreamConfig;
import org.apache.pinot.spi.stream.StreamPartitionMsgOffset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class KafkaPartitionLevelConsumer extends KafkaPartitionLevelConnectionHandler
implements PartitionLevelConsumer {
private static final Logger LOGGER = LoggerFactory.getLogger(KafkaPartitionLevelConsumer.class);

public KafkaPartitionLevelConsumer(String clientId, StreamConfig streamConfig, int partition) {
super(clientId, streamConfig, partition);
}

@Override
public MessageBatch fetchMessages(StreamPartitionMsgOffset startMsgOffset, StreamPartitionMsgOffset endMsgOffset,
int timeoutMillis)
throws TimeoutException {
public MessageBatch<byte[]> fetchMessages(StreamPartitionMsgOffset startMsgOffset,
StreamPartitionMsgOffset endMsgOffset, int timeoutMillis) {
final long startOffset = ((LongMsgOffset) startMsgOffset).getOffset();
final long endOffset = endMsgOffset == null ? Long.MAX_VALUE : ((LongMsgOffset) endMsgOffset).getOffset();
return fetchMessages(startOffset, endOffset, timeoutMillis);
}

public MessageBatch fetchMessages(long startOffset, long endOffset, int timeoutMillis)
throws TimeoutException {
public MessageBatch<byte[]> fetchMessages(long startOffset, long endOffset, int timeoutMillis) {
_consumer.seek(_topicPartition, startOffset);
ConsumerRecords<String, Bytes> consumerRecords = _consumer.poll(Duration.ofMillis(timeoutMillis));
final Iterable<ConsumerRecord<String, Bytes>> messageAndOffsetIterable =
buildOffsetFilteringIterable(consumerRecords.records(_topicPartition), startOffset, endOffset);
return new KafkaMessageBatch(messageAndOffsetIterable);
}

private Iterable<ConsumerRecord<String, Bytes>> buildOffsetFilteringIterable(
final List<ConsumerRecord<String, Bytes>> messageAndOffsets, final long startOffset, final long endOffset) {
return Iterables.filter(messageAndOffsets, input -> {
// Filter messages that are either null or have an offset ∉ [startOffset, endOffset]
return input != null && input.value() != null && input.offset() >= startOffset && (endOffset > input.offset()
|| endOffset == -1);
});
}

@Override
public void close()
throws IOException {
super.close();
List<ConsumerRecord<String, Bytes>> messageAndOffsets = consumerRecords.records(_topicPartition);
List<MessageAndOffset> filtered = new ArrayList<>(messageAndOffsets.size());
Member Author: Note that a list was being materialised in KafkaMessageBatch anyway; it's just easier to do it here because we can also capture the last offset. This is likely more efficient than using Iterables.filter.

long lastOffset = startOffset;
for (ConsumerRecord<String, Bytes> messageAndOffset : messageAndOffsets) {
Bytes message = messageAndOffset.value();
long offset = messageAndOffset.offset();
if (offset >= startOffset & (endOffset > offset | endOffset == -1)) {
if (message != null) {
filtered.add(new MessageAndOffset(message.get(), offset));
}
lastOffset = offset;
}
}
return new KafkaMessageBatch(messageAndOffsets.size(), lastOffset, filtered);
}
}
@@ -77,7 +77,7 @@ public KinesisPartitionGroupOffset fromString(String kinesisCheckpointStr) {
}

@Override
public int compareTo(Object o) {
public int compareTo(StreamPartitionMsgOffset o) {
Preconditions.checkNotNull(o);
KinesisPartitionGroupOffset other = (KinesisPartitionGroupOffset) o;
Preconditions.checkNotNull(other._shardToStartSequenceMap);
@@ -60,7 +60,7 @@ public StreamPartitionMsgOffset fromString(String streamPartitionMsgOffsetStr) {
}

@Override
public int compareTo(Object other) {
public int compareTo(StreamPartitionMsgOffset other) {
MessageIdStreamOffset messageIdStreamOffset = (MessageIdStreamOffset) other;
return _messageId.compareTo(messageIdStreamOffset.getMessageId());
}
@@ -42,7 +42,7 @@ public LongMsgOffset(StreamPartitionMsgOffset other) {
}

@Override
public int compareTo(Object other) {
public int compareTo(StreamPartitionMsgOffset other) {
return Long.compare(_offset, ((LongMsgOffset) other)._offset);
}

@@ -31,11 +31,17 @@
@InterfaceStability.Stable
public interface MessageBatch<T> {
/**
*
* @return number of messages returned from the stream
* @return number of available messages
*/
int getMessageCount();

/**
* @return number of messages returned from the stream
*/
default int getUnfilteredMessageCount() {
return getMessageCount();
Member Author: Implemented for Kafka 2.0 only.

}

/**
* Returns the message at a particular index inside a set of messages returned from the stream.
* @param index
@@ -82,6 +88,13 @@ default StreamPartitionMsgOffset getNextStreamParitionMsgOffsetAtIndex(int index
return new LongMsgOffset(getNextStreamMessageOffsetAtIndex(index));
}

/**
* @return the offset at which the next batch should start, i.e. one past the last message in this batch
*/
default StreamPartitionMsgOffset getOffsetOfNextBatch() {
return getNextStreamParitionMsgOffsetAtIndex(getMessageCount() - 1);
}

/**
* Returns true if the consumer detects that no more records can be read from this partition group for good
*/
@@ -40,7 +40,7 @@
* versions of the stream implementation
*/
@InterfaceStability.Evolving
public interface StreamPartitionMsgOffset extends Comparable {
public interface StreamPartitionMsgOffset extends Comparable<StreamPartitionMsgOffset> {
Member Author: The raw Comparable caused a lot of warnings which made the code harder to read; parameterising it helps the compiler prevent heap-pollution bugs.


/**
* A serialized representation of the offset object as a String.
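The author's note about the typed Comparable is easier to see with a toy example. The classes below are hypothetical and not from Pinot: with the raw Comparable a mismatched argument compiles and only fails at runtime, while the parameterised version rejects it at compile time and removes the unchecked cast inside compareTo.

// Raw Comparable: compareTo takes Object, so any argument compiles and the cast is unchecked.
final class RawOffset implements Comparable {
  private final long _offset;
  RawOffset(long offset) { _offset = offset; }
  @Override
  public int compareTo(Object other) {
    return Long.compare(_offset, ((RawOffset) other)._offset);  // may throw ClassCastException
  }
}

// Parameterised Comparable: compareTo is typed, no self-cast, and misuse does not compile.
final class TypedOffset implements Comparable<TypedOffset> {
  private final long _offset;
  TypedOffset(long offset) { _offset = offset; }
  @Override
  public int compareTo(TypedOffset other) {
    return Long.compare(_offset, other._offset);
  }
}

public class ComparableOffsetDemo {
  public static void main(String[] args) {
    try {
      new RawOffset(1).compareTo("not an offset");  // compiles, fails at runtime
    } catch (ClassCastException e) {
      System.out.println("raw Comparable only failed at runtime: " + e.getMessage());
    }
    // new TypedOffset(1).compareTo("not an offset");  // does not compile: String is not a TypedOffset
    System.out.println(new TypedOffset(1).compareTo(new TypedOffset(2)));  // -1
  }
}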