Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KAFKA-16143: New JMX metrics for AsyncKafkaConsumer #17199

Merged
merged 1 commit into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
import org.apache.kafka.clients.consumer.internals.events.TopicSubscriptionChangeEvent;
import org.apache.kafka.clients.consumer.internals.events.UnsubscribeEvent;
import org.apache.kafka.clients.consumer.internals.events.UpdatePatternSubscriptionEvent;
import org.apache.kafka.clients.consumer.internals.metrics.KafkaConsumerMetrics;
import org.apache.kafka.clients.consumer.internals.metrics.AsyncConsumerMetrics;
import org.apache.kafka.clients.consumer.internals.metrics.RebalanceCallbackMetricsManager;
import org.apache.kafka.common.Cluster;
import org.apache.kafka.common.IsolationLevel;
Expand Down Expand Up @@ -109,7 +109,6 @@
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
Expand Down Expand Up @@ -215,10 +214,11 @@ private void process(final ConsumerRebalanceListenerCallbackNeededEvent event) {
private final ApplicationEventHandler applicationEventHandler;
private final Time time;
private final AtomicReference<Optional<ConsumerGroupMetadata>> groupMetadata = new AtomicReference<>(Optional.empty());
private final KafkaConsumerMetrics kafkaConsumerMetrics;
private final AsyncConsumerMetrics kafkaConsumerMetrics;
private Logger log;
private final String clientId;
private final BlockingQueue<BackgroundEvent> backgroundEventQueue;
private final BackgroundEventHandler backgroundEventHandler;
private final BackgroundEventProcessor backgroundEventProcessor;
private final CompletableEventReaper backgroundEventReaper;
private final Deserializers<K, V> deserializers;
Expand Down Expand Up @@ -320,6 +320,7 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {
this.clientTelemetryReporter = CommonClientConfigs.telemetryReporter(clientId, config);
this.clientTelemetryReporter.ifPresent(reporters::add);
this.metrics = createMetrics(config, time, reporters);
this.kafkaConsumerMetrics = new AsyncConsumerMetrics(metrics);
this.retryBackoffMs = config.getLong(ConsumerConfig.RETRY_BACKOFF_MS_CONFIG);

List<ConsumerInterceptor<K, V>> interceptorList = configuredConsumerInterceptors(config);
Expand All @@ -339,7 +340,11 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {

ApiVersions apiVersions = new ApiVersions();
final BlockingQueue<ApplicationEvent> applicationEventQueue = new LinkedBlockingQueue<>();
final BackgroundEventHandler backgroundEventHandler = new BackgroundEventHandler(backgroundEventQueue);
this.backgroundEventHandler = new BackgroundEventHandler(
backgroundEventQueue,
time,
kafkaConsumerMetrics
);

// This FetchBuffer is shared between the application and network threads.
this.fetchBuffer = new FetchBuffer(logContext);
Expand All @@ -352,7 +357,9 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {
fetchMetricsManager.throttleTimeSensor(),
clientTelemetryReporter.map(ClientTelemetryReporter::telemetrySender).orElse(null),
backgroundEventHandler,
false);
false,
kafkaConsumerMetrics
);
this.offsetCommitCallbackInvoker = new OffsetCommitCallbackInvoker(interceptors);
this.groupMetadata.set(initializeGroupMetadata(config, groupRebalanceConfig));
final Supplier<RequestManagers> requestManagersSupplier = RequestManagers.supplier(time,
Expand Down Expand Up @@ -382,7 +389,9 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {
new CompletableEventReaper(logContext),
applicationEventProcessorSupplier,
networkClientDelegateSupplier,
requestManagersSupplier);
requestManagersSupplier,
kafkaConsumerMetrics
);

this.rebalanceListenerInvoker = new ConsumerRebalanceListenerInvoker(
logContext,
Expand All @@ -402,8 +411,6 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {
fetchMetricsManager,
time);

this.kafkaConsumerMetrics = new KafkaConsumerMetrics(metrics, CONSUMER_METRIC_GROUP_PREFIX);

if (groupMetadata.get().isPresent() &&
GroupProtocol.of(config.getString(ConsumerConfig.GROUP_PROTOCOL_CONFIG)) == GroupProtocol.CONSUMER) {
config.ignore(ConsumerConfig.GROUP_REMOTE_ASSIGNOR_CONFIG); // Used by background thread
Expand Down Expand Up @@ -460,10 +467,15 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {
this.defaultApiTimeoutMs = Duration.ofMillis(defaultApiTimeoutMs);
this.deserializers = deserializers;
this.applicationEventHandler = applicationEventHandler;
this.kafkaConsumerMetrics = new KafkaConsumerMetrics(metrics, "consumer");
this.kafkaConsumerMetrics = new AsyncConsumerMetrics(metrics);
this.clientTelemetryReporter = Optional.empty();
this.autoCommitEnabled = autoCommitEnabled;
this.offsetCommitCallbackInvoker = new OffsetCommitCallbackInvoker(interceptors);
this.backgroundEventHandler = new BackgroundEventHandler(
backgroundEventQueue,
time,
kafkaConsumerMetrics
);
}

AsyncKafkaConsumer(LogContext logContext,
Expand Down Expand Up @@ -498,7 +510,7 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {
deserializers,
fetchMetricsManager,
time);
this.kafkaConsumerMetrics = new KafkaConsumerMetrics(metrics, "consumer");
this.kafkaConsumerMetrics = new AsyncConsumerMetrics(metrics);

GroupRebalanceConfig groupRebalanceConfig = new GroupRebalanceConfig(
config,
Expand All @@ -509,7 +521,11 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {

BlockingQueue<ApplicationEvent> applicationEventQueue = new LinkedBlockingQueue<>();
this.backgroundEventQueue = new LinkedBlockingQueue<>();
BackgroundEventHandler backgroundEventHandler = new BackgroundEventHandler(backgroundEventQueue);
this.backgroundEventHandler = new BackgroundEventHandler(
backgroundEventQueue,
time,
kafkaConsumerMetrics
);
this.rebalanceListenerInvoker = new ConsumerRebalanceListenerInvoker(
logContext,
subscriptions,
Expand All @@ -524,7 +540,8 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {
client,
metadata,
backgroundEventHandler,
false
false,
kafkaConsumerMetrics
);
this.offsetCommitCallbackInvoker = new OffsetCommitCallbackInvoker(interceptors);
Supplier<RequestManagers> requestManagersSupplier = RequestManagers.supplier(
Expand Down Expand Up @@ -556,7 +573,8 @@ public void onGroupAssignmentUpdated(Set<TopicPartition> partitions) {
new CompletableEventReaper(logContext),
applicationEventProcessorSupplier,
networkClientDelegateSupplier,
requestManagersSupplier);
requestManagersSupplier,
kafkaConsumerMetrics);
this.backgroundEventProcessor = new BackgroundEventProcessor();
this.backgroundEventReaper = new CompletableEventReaper(logContext);
}
Expand All @@ -571,7 +589,8 @@ ApplicationEventHandler build(
final CompletableEventReaper applicationEventReaper,
final Supplier<ApplicationEventProcessor> applicationEventProcessorSupplier,
final Supplier<NetworkClientDelegate> networkClientDelegateSupplier,
final Supplier<RequestManagers> requestManagersSupplier
final Supplier<RequestManagers> requestManagersSupplier,
final AsyncConsumerMetrics asyncConsumerMetrics
);

}
Expand Down Expand Up @@ -1941,25 +1960,30 @@ private void subscribeInternal(Collection<String> topics, Optional<ConsumerRebal
* It is possible that {@link ErrorEvent an error}
* could occur when processing the events. In such cases, the processor will take a reference to the first
* error, continue to process the remaining events, and then throw the first error that occurred.
*
* Visible for testing.
*/
private boolean processBackgroundEvents() {
boolean processBackgroundEvents() {
AtomicReference<KafkaException> firstError = new AtomicReference<>();

LinkedList<BackgroundEvent> events = new LinkedList<>();
backgroundEventQueue.drainTo(events);

for (BackgroundEvent event : events) {
try {
if (event instanceof CompletableEvent)
backgroundEventReaper.add((CompletableEvent<?>) event);

backgroundEventProcessor.process(event);
} catch (Throwable t) {
KafkaException e = ConsumerUtils.maybeWrapAsKafkaException(t);

if (!firstError.compareAndSet(null, e))
log.warn("An error occurred when processing the background event: {}", e.getMessage(), e);
List<BackgroundEvent> events = backgroundEventHandler.drainEvents();
if (!events.isEmpty()) {
long startMs = time.milliseconds();
for (BackgroundEvent event : events) {
kafkaConsumerMetrics.recordBackgroundEventQueueTime(time.milliseconds() - event.enqueuedMs());
try {
if (event instanceof CompletableEvent)
backgroundEventReaper.add((CompletableEvent<?>) event);

backgroundEventProcessor.process(event);
} catch (Throwable t) {
KafkaException e = ConsumerUtils.maybeWrapAsKafkaException(t);

if (!firstError.compareAndSet(null, e))
log.warn("An error occurred when processing the background event: {}", e.getMessage(), e);
}
}
kafkaConsumerMetrics.recordBackgroundEventQueueProcessingTime(time.milliseconds() - startMs);
}

backgroundEventReaper.reap(time.milliseconds());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How are we going to account for events which expired and are removed from the queue by the event reaper? They probably ought to be included in the metrics.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we don't have this metric in original KIP. Do we want to add one?

https://cwiki.apache.org/confluence/display/KAFKA/KIP-1068%3A+New+metrics+for+the+new+KafkaConsumer

Copy link
Member

@lianetm lianetm Dec 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, and if we agree on what we want we could just send an update in the KIP email thread to add it to the KIP and here.

To align internally first, I guess we would be interested in the num/avg of expired events, but we need to consider how that metric would go crazy and be a false alarm in cases like poll(0) right? Should we consider the expiration relevant only if there was a non-zero timeout? Thoughts?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the only background event which has deadline is ConsumerRebalanceListenerCallbackNeededEvent, but its deadline is Long.MAX_VALUE, so it will never be expired. Do we want to update it? If yes, I can create a Jira for it. If not, we may not need to add this metric. WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree that it's not applicable for background events because this callback needed is the only CompletableBackgroundEvent, and it's intentionally not expired (I don't think we need to change that)

I would say that what might be interesting to know is expiration of Application events.

There we do have lots of events with deadline, I guess that's what @AndrewJSchofield had in mind maybe? (I notice now that the initial comment was here on the reap of background events).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the background event reaper triggered my thought, but @lianetm is correct. The interesting part is the events with deadlines that are timing out.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a new metric application-event-expired-size. After we all agree this metric, I will send a mail to original vote thread to mention the new metric.

I also change CompletableEventReaper#reap to return expired event count, so we can record the value. Thanks.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this sounds like a useful metric to have. Thanks!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense to me too. I would just suggest we name it application-events-expired-count.

Expand Down Expand Up @@ -2088,7 +2112,7 @@ public Metrics metricsRegistry() {
}

@Override
public KafkaConsumerMetrics kafkaConsumerMetrics() {
public AsyncConsumerMetrics kafkaConsumerMetrics() {
return kafkaConsumerMetrics;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.kafka.clients.consumer.internals.events.CompletableApplicationEvent;
import org.apache.kafka.clients.consumer.internals.events.CompletableEvent;
import org.apache.kafka.clients.consumer.internals.events.CompletableEventReaper;
import org.apache.kafka.clients.consumer.internals.metrics.AsyncConsumerMetrics;
import org.apache.kafka.common.internals.IdempotentCloser;
import org.apache.kafka.common.requests.AbstractRequest;
import org.apache.kafka.common.utils.KafkaThread;
Expand Down Expand Up @@ -62,21 +63,24 @@ public class ConsumerNetworkThread extends KafkaThread implements Closeable {
private final Supplier<ApplicationEventProcessor> applicationEventProcessorSupplier;
private final Supplier<NetworkClientDelegate> networkClientDelegateSupplier;
private final Supplier<RequestManagers> requestManagersSupplier;
private final AsyncConsumerMetrics asyncConsumerMetrics;
private ApplicationEventProcessor applicationEventProcessor;
private NetworkClientDelegate networkClientDelegate;
private RequestManagers requestManagers;
private volatile boolean running;
private final IdempotentCloser closer = new IdempotentCloser();
private volatile Duration closeTimeout = Duration.ofMillis(DEFAULT_CLOSE_TIMEOUT_MS);
private volatile long cachedMaximumTimeToWait = MAX_POLL_TIMEOUT_MS;
private long lastPollTimeMs = 0L;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only ever written to and read from the same thread, right?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pinging on this. I believe it's only written on the background thread, but just want to be sure.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @kirktrue, this is only used for metrics time-between-network-thread-poll-max and time-between-network-thread-poll-avg. Thanks.


public ConsumerNetworkThread(LogContext logContext,
Time time,
BlockingQueue<ApplicationEvent> applicationEventQueue,
CompletableEventReaper applicationEventReaper,
Supplier<ApplicationEventProcessor> applicationEventProcessorSupplier,
Supplier<NetworkClientDelegate> networkClientDelegateSupplier,
Supplier<RequestManagers> requestManagersSupplier) {
Supplier<RequestManagers> requestManagersSupplier,
AsyncConsumerMetrics asyncConsumerMetrics) {
super(BACKGROUND_THREAD_NAME, true);
this.time = time;
this.log = logContext.logger(getClass());
Expand All @@ -86,6 +90,7 @@ public ConsumerNetworkThread(LogContext logContext,
this.networkClientDelegateSupplier = networkClientDelegateSupplier;
this.requestManagersSupplier = requestManagersSupplier;
this.running = true;
this.asyncConsumerMetrics = asyncConsumerMetrics;
}

@Override
Expand Down Expand Up @@ -141,6 +146,11 @@ void runOnce() {
processApplicationEvents();

final long currentTimeMs = time.milliseconds();
if (lastPollTimeMs != 0L) {
asyncConsumerMetrics.recordTimeBetweenNetworkThreadPoll(currentTimeMs - lastPollTimeMs);
}
lastPollTimeMs = currentTimeMs;

final long pollWaitTimeMs = requestManagers.entries().stream()
.filter(Optional::isPresent)
.map(Optional::get)
Expand All @@ -166,8 +176,13 @@ void runOnce() {
private void processApplicationEvents() {
LinkedList<ApplicationEvent> events = new LinkedList<>();
applicationEventQueue.drainTo(events);
if (events.isEmpty())
return;

asyncConsumerMetrics.recordApplicationEventQueueSize(0);
long startMs = time.milliseconds();
for (ApplicationEvent event : events) {
asyncConsumerMetrics.recordApplicationEventQueueTime(time.milliseconds() - event.enqueuedMs());
try {
if (event instanceof CompletableEvent) {
applicationEventReaper.add((CompletableEvent<?>) event);
Expand All @@ -181,6 +196,7 @@ private void processApplicationEvents() {
log.warn("Error processing event {}", t.getMessage(), t);
}
}
asyncConsumerMetrics.recordApplicationEventQueueProcessingTime(time.milliseconds() - startMs);
}

/**
Expand All @@ -189,7 +205,7 @@ private void processApplicationEvents() {
* is given least one attempt to satisfy any network requests <em>before</em> checking if a timeout has expired.
*/
private void reapExpiredApplicationEvents(long currentTimeMs) {
applicationEventReaper.reap(currentTimeMs);
asyncConsumerMetrics.recordApplicationEventExpiredSize(applicationEventReaper.reap(currentTimeMs));
}

/**
Expand Down Expand Up @@ -326,7 +342,7 @@ void cleanup() {
log.error("Unexpected error during shutdown. Proceed with closing.", e);
} finally {
sendUnsentRequests(timer);
applicationEventReaper.reap(applicationEventQueue);
asyncConsumerMetrics.recordApplicationEventExpiredSize(applicationEventReaper.reap(applicationEventQueue));

closeQuietly(requestManagers, "request managers");
closeQuietly(networkClientDelegate, "network client delegate");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ public final class ConsumerUtils {
public static final String CONSUMER_SHARE_METRIC_GROUP_PREFIX = "consumer-share";
public static final String COORDINATOR_METRICS_SUFFIX = "-coordinator-metrics";
public static final String CONSUMER_METRICS_SUFFIX = "-metrics";
public static final String CONSUMER_METRIC_GROUP = CONSUMER_METRIC_GROUP_PREFIX + CONSUMER_METRICS_SUFFIX;

/**
* A fixed, large enough value will suffice for max.
Expand Down
Loading
Loading