
Kafka Source Code Analysis

13 min read
Le Dai
Sr. Software Engineer

Kafka Producer Send Message

// The Kafka client sends a message through the send() API
public Future<RecordMetadata> send(ProducerRecord<K, V> record, Callback callback) {
// intercept the record, which can be potentially modified; this method does not throw exceptions
ProducerRecord<K, V> interceptedRecord = this.interceptors.onSend(record);
return doSend(interceptedRecord, callback);
}
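
Before diving into doSend(), here is a minimal, self-contained usage sketch of the API being traced. The bootstrap address, topic name, and serializers are illustrative assumptions, not taken from the source above.

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

public class ProducerSendExample {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // assumed address
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            ProducerRecord<String, String> record = new ProducerRecord<>("demo-topic", "key", "value");
            // send() is asynchronous: it returns a Future immediately, and the callback
            // runs once the broker acknowledges the batch (or the send fails)
            producer.send(record, (metadata, exception) -> {
                if (exception != null)
                    exception.printStackTrace();
                else
                    System.out.printf("sent to %s-%d@%d%n",
                            metadata.topic(), metadata.partition(), metadata.offset());
            });
        }
    }
}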

/**
 * Implementation of asynchronously sending a record to a topic.
 * The producer sends data asynchronously by default; doSend() is the core of that path.
 */
private Future<RecordMetadata> doSend(ProducerRecord<K, V> record, Callback callback) {
TopicPartition tp = null;
try {
throwIfProducerClosed();
// first make sure the metadata for the topic is available
ClusterAndWaitTime clusterAndWaitTime;
try {
clusterAndWaitTime = waitOnMetadata(record.topic(), record.partition(), maxBlockTimeMs);
} catch (KafkaException e) {
if (metadata.isClosed())
throw new KafkaException("Producer closed while send in progress", e);
throw e;
}
long remainingWaitMs = Math.max(0, maxBlockTimeMs - clusterAndWaitTime.waitedOnMetadataMs);
Cluster cluster = clusterAndWaitTime.cluster;
byte[] serializedKey;
try {
serializedKey = keySerializer.serialize(record.topic(), record.headers(), record.key());
} catch (ClassCastException cce) {
throw new SerializationException("Can't convert key of class " + record.key().getClass().getName() +
" to class " + producerConfig.getClass(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG).getName() +
" specified in key.serializer", cce);
}
byte[] serializedValue;
try {
serializedValue = valueSerializer.serialize(record.topic(), record.headers(), record.value());
} catch (ClassCastException cce) {
throw new SerializationException("Can't convert value of class " + record.value().getClass().getName() +
" to class " + producerConfig.getClass(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG).getName() +
" specified in value.serializer", cce);
}
// Compute the target partition. If no custom partitioner is configured, DefaultPartitioner is used:
// - key present: murmur2 hash of the serialized key % number of partitions
// - key absent: a built-in AtomicInteger, getAndIncrement() % number of available partitions, spreading records round-robin
int partition = partition(record, serializedKey, serializedValue, cluster);
tp = new TopicPartition(record.topic(), partition);

setReadOnly(record.headers());
Header[] headers = record.headers().toArray();
// Estimate an upper bound for the serialized record size
int serializedSize = AbstractRecords.estimateSizeInBytesUpperBound(apiVersions.maxUsableProduceMagic(),
compressionType, serializedKey, serializedValue, headers);
ensureValidRecordSize(serializedSize);
long timestamp = record.timestamp() == null ? time.milliseconds() : record.timestamp();
log.trace("Sending record {} with callback {} to topic {} partition {}", record, callback, record.topic(), partition);
// producer callback will make sure to call both 'callback' and interceptor callback
Callback interceptCallback = new InterceptorCallback<>(callback, this.interceptors, tp);

if (transactionManager != null && transactionManager.isTransactional())
transactionManager.maybeAddPartitionToTransaction(tp);
// Append the record to the accumulator; the Sender thread later drains these queues and sends the data
RecordAccumulator.RecordAppendResult result = accumulator.append(tp, timestamp, serializedKey,
serializedValue, headers, interceptCallback, remainingWaitMs);
if (result.batchIsFull || result.newBatchCreated) { // the batch is full or a new batch was created, so wake up the sender
log.trace("Waking up the sender since topic {} partition {} is either full or getting a new batch", record.topic(), partition);
this.sender.wakeup();
}
return result.future;
// handling exceptions and record the errors;
// for API exceptions return them in the future,
// for other exceptions throw directly
} catch (ApiException e) {
log.debug("Exception occurred during message send:", e);
if (callback != null)
callback.onCompletion(null, e);
this.errors.record();
this.interceptors.onSendError(record, tp, e);
return new FutureFailure(e);
} catch (InterruptedException e) {
this.errors.record();
this.interceptors.onSendError(record, tp, e);
throw new InterruptException(e);
} catch (BufferExhaustedException e) {
this.errors.record();
this.metrics.sensor("buffer-exhausted-records").record();
this.interceptors.onSendError(record, tp, e);
throw e;
} catch (KafkaException e) {
this.errors.record();
this.interceptors.onSendError(record, tp, e);
throw e;
} catch (Exception e) {
// we notify interceptor about all exceptions, since onSend is called before anything else in this method
this.interceptors.onSendError(record, tp, e);
throw e;
}
}
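
A simplified sketch of the DefaultPartitioner behaviour described in the comments above (paraphrased, not copied verbatim from the Kafka source; the class and method names here are mine):

import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.utils.Utils;

class DefaultPartitionerSketch {
    private final AtomicInteger counter = new AtomicInteger(0);

    int choosePartition(byte[] keyBytes, List<PartitionInfo> allPartitions,
                        List<PartitionInfo> availablePartitions) {
        if (keyBytes != null) {
            // keyed record: murmur2 hash of the serialized key modulo the total partition count
            return Utils.toPositive(Utils.murmur2(keyBytes)) % allPartitions.size();
        }
        // unkeyed record: round-robin over the currently available partitions
        int next = counter.getAndIncrement();
        if (!availablePartitions.isEmpty())
            return availablePartitions.get(Utils.toPositive(next) % availablePartitions.size()).partition();
        return Utils.toPositive(next) % allPartitions.size();
    }
}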
// accumulator.append() flow
// RecordAccumulator.batches is the producer-side message cache:
// private final ConcurrentMap<TopicPartition, Deque<ProducerBatch>> batches;
// keyed by topic + partition; each value is a deque of ProducerBatch entries

public RecordAppendResult append(TopicPartition tp,
long timestamp,
byte[] key,
byte[] value,
Header[] headers,
Callback callback,
long maxTimeToBlock) throws InterruptedException {
// We keep track of the number of appending threads to make sure we do not miss batches in
// abortIncompleteBatches().
appendsInProgress.incrementAndGet();
ByteBuffer buffer = null;
if (headers == null) headers = Record.EMPTY_HEADERS;
try {
// Get or create the deque for this record's topic partition
Deque<ProducerBatch> dq = getOrCreateDeque(tp);
synchronized (dq) {
if (closed)
throw new KafkaException("Producer closed while send in progress");
RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
if (appendResult != null)
return appendResult;
}

// we don't have an in-progress record batch, so try to allocate a new batch
byte maxUsableMagic = apiVersions.maxUsableProduceMagic();
// Estimate the buffer space this record needs
int size = Math.max(this.batchSize, AbstractRecords.estimateSizeInBytesUpperBound(maxUsableMagic, compression, key, value, headers));
log.trace("Allocating a new {} byte message buffer for topic {} partition {}", size, tp.topic(), tp.partition());
// Allocate a buffer from the BufferPool (may block for up to maxTimeToBlock)
buffer = free.allocate(size, maxTimeToBlock);
synchronized (dq) {
// Need to check if producer is closed again after grabbing the dequeue lock.
if (closed)
throw new KafkaException("Producer closed while send in progress");

RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
if (appendResult != null) {
// Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often...
return appendResult;
}

MemoryRecordsBuilder recordsBuilder = recordsBuilder(buffer, maxUsableMagic);
ProducerBatch batch = new ProducerBatch(tp, recordsBuilder, time.milliseconds());
// Append the record to the newly created ProducerBatch
FutureRecordMetadata future = Utils.notNull(batch.tryAppend(timestamp, key, value, headers, callback, time.milliseconds()));

// Add the new batch to the end of the deque and track it as incomplete
dq.addLast(batch);
incomplete.add(batch);

// Don't deallocate this buffer in the finally block as it's being used in the record batch
buffer = null;
return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true);
}
} finally {
// Clean up: return any unused buffer to the pool and decrement the appends-in-progress counter
if (buffer != null)
free.deallocate(buffer);
appendsInProgress.decrementAndGet();
}
}
// At this point the record has been serialized, batched, and cached locally.
// Send path: KafkaProducer holds a private final Sender sender, which is responsible for taking the
// batches accumulated in RecordAccumulator.batches and sending them to the corresponding Kafka brokers.
// Sender implements Runnable; its run() method loops until close() is called:
// main loop, runs until close is called
while (running) {
try {
runOnce();
} catch (Exception e) {
log.error("Uncaught error in kafka producer I/O thread: ", e);
}
}
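
The loop above runs on a dedicated daemon I/O thread. Roughly how the KafkaProducer constructor starts it (paraphrased from memory, not line-exact):

// inside the KafkaProducer constructor (simplified)
String ioThreadName = NETWORK_THREAD_PREFIX + " | " + clientId;   // "kafka-producer-network-thread | <client id>"
this.ioThread = new KafkaThread(ioThreadName, this.sender, true); // daemon thread running Sender.run()
this.ioThread.start();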

/**
* Run a single iteration of sending
*
*/
void runOnce() {
if (transactionManager != null) {
try {
transactionManager.resetProducerIdIfNeeded();

if (!transactionManager.isTransactional()) {
// this is an idempotent producer, so make sure we have a producer id
maybeWaitForProducerId();
} else if (transactionManager.hasUnresolvedSequences() && !transactionManager.hasFatalError()) {
transactionManager.transitionToFatalError(
new KafkaException("The client hasn't received acknowledgment for " +
"some previously sent messages and can no longer retry them. It isn't safe to continue."));
} else if (maybeSendAndPollTransactionalRequest()) {
return;
}

// do not continue sending if the transaction manager is in a failed state or if there
// is no producer id (for the idempotent case).
if (transactionManager.hasFatalError() || !transactionManager.hasProducerId()) {
RuntimeException lastError = transactionManager.lastError();
if (lastError != null)
maybeAbortBatches(lastError);
client.poll(retryBackoffMs, time.milliseconds());
return;
} else if (transactionManager.hasAbortableError()) {
accumulator.abortUndrainedBatches(transactionManager.lastError());
}
} catch (AuthenticationException e) {
// This is already logged as error, but propagated here to perform any clean ups.
log.trace("Authentication exception while processing transactional request: {}", e);
transactionManager.authenticationFailed(e);
}
}

long currentTimeMs = time.milliseconds();
// send the accumulated producer data
long pollTimeout = sendProducerData(currentTimeMs);
client.poll(pollTimeout, currentTimeMs);
}

private long sendProducerData(long now) {
Cluster cluster = metadata.fetch(); // cluster metadata, kept up to date via MetadataRequest
// get the list of partitions with data ready to send
RecordAccumulator.ReadyCheckResult result = this.accumulator.ready(cluster, now);

// if there are any partitions whose leaders are not known yet, force metadata update
if (!result.unknownLeaderTopics.isEmpty()) {
// The set of topics with unknown leader contains topics with leader election pending as well as
// topics which may have expired. Add the topic again to metadata to ensure it is included
// and request metadata update, since there are messages to send to the topic.
for (String topic : result.unknownLeaderTopics)
this.metadata.add(topic);

log.debug("Requesting metadata update due to unknown leader topics from the batched records: {}",
result.unknownLeaderTopics);
this.metadata.requestUpdate();
}

// remove any nodes we aren't ready to send to
Iterator<Node> iter = result.readyNodes.iterator();
long notReadyTimeout = Long.MAX_VALUE;
while (iter.hasNext()) {
Node node = iter.next();
if (!this.client.ready(node, now)) {
iter.remove();
notReadyTimeout = Math.min(notReadyTimeout, this.client.pollDelayMs(node, now));
}
}

// create produce requests
// Drain the accumulated batches, grouped by the leader node of each partition: the map key is the broker id, the value is the list of batches destined for that broker
Map<Integer, List<ProducerBatch>> batches = this.accumulator.drain(cluster, result.readyNodes, this.maxRequestSize, now);
addToInflightBatches(batches);
if (guaranteeMessageOrder) {
// Mute all the partitions drained
for (List<ProducerBatch> batchList : batches.values()) {
for (ProducerBatch batch : batchList)
this.accumulator.mutePartition(batch.topicPartition);
}
}

accumulator.resetNextBatchExpiryTime();
List<ProducerBatch> expiredInflightBatches = getExpiredInflightBatches(now);
List<ProducerBatch> expiredBatches = this.accumulator.expiredBatches(now);
expiredBatches.addAll(expiredInflightBatches);

// Reset the producer id if an expired batch has previously been sent to the broker. Also update the metrics
// for expired batches. see the documentation of @TransactionState.resetProducerId to understand why
// we need to reset the producer id here.
if (!expiredBatches.isEmpty())
log.trace("Expired {} batches in accumulator", expiredBatches.size());
for (ProducerBatch expiredBatch : expiredBatches) {
String errorMessage = "Expiring " + expiredBatch.recordCount + " record(s) for " + expiredBatch.topicPartition
+ ":" + (now - expiredBatch.createdMs) + " ms has passed since batch creation";
failBatch(expiredBatch, -1, NO_TIMESTAMP, new TimeoutException(errorMessage), false);
if (transactionManager != null && expiredBatch.inRetry()) {
// This ensures that no new batches are drained until the current in flight batches are fully resolved.
transactionManager.markSequenceUnresolved(expiredBatch.topicPartition);
}
}
sensors.updateProduceRequestMetrics(batches);

// If we have any nodes that are ready to send + have sendable data, poll with 0 timeout so this can immediately
// loop and try sending more data. Otherwise, the timeout will be the smaller value between next batch expiry
// time, and the delay time for checking data availability. Note that the nodes may have data that isn't yet
// sendable due to lingering, backing off, etc. This specifically does not include nodes with sendable data
// that aren't ready to send since they would cause busy looping.
long pollTimeout = Math.min(result.nextReadyCheckDelayMs, notReadyTimeout);
pollTimeout = Math.min(pollTimeout, this.accumulator.nextExpiryTimeMs() - now);
pollTimeout = Math.max(pollTimeout, 0);
if (!result.readyNodes.isEmpty()) {
log.trace("Nodes with data ready to send: {}", result.readyNodes);
// if some partitions are already ready to be sent, the select time would be 0;
// otherwise if some partition already has some data accumulated but not ready yet,
// the select time will be the time difference between now and its linger expiry time;
// otherwise the select time will be the time difference between now and the metadata expiry time;
pollTimeout = 0;
}
// Wrap the batches into ProduceRequests and send each one to the corresponding partition leader
sendProduceRequests(batches, now);
return pollTimeout;
}

// Build the ProduceRequest and send it to the destination leader through the NetworkClient
private void sendProduceRequest(long now, int destination, short acks, int timeout, List<ProducerBatch> batches) {
if (batches.isEmpty())
return;

Map<TopicPartition, MemoryRecords> produceRecordsByPartition = new HashMap<>(batches.size());
final Map<TopicPartition, ProducerBatch> recordsByPartition = new HashMap<>(batches.size());

// find the minimum magic version used when creating the record sets
byte minUsedMagic = apiVersions.maxUsableProduceMagic();
for (ProducerBatch batch : batches) {
if (batch.magic() < minUsedMagic)
minUsedMagic = batch.magic();
}

for (ProducerBatch batch : batches) {
TopicPartition tp = batch.topicPartition;
MemoryRecords records = batch.records();

// down convert if necessary to the minimum magic used. In general, there can be a delay between the time
// that the producer starts building the batch and the time that we send the request, and we may have
// chosen the message format based on out-dated metadata. In the worst case, we optimistically chose to use
// the new message format, but found that the broker didn't support it, so we need to down-convert on the
// client before sending. This is intended to handle edge cases around cluster upgrades where brokers may
// not all support the same message format version. For example, if a partition migrates from a broker
// which is supporting the new magic version to one which doesn't, then we will need to convert.
if (!records.hasMatchingMagic(minUsedMagic))
records = batch.records().downConvert(minUsedMagic, 0, time).records();
produceRecordsByPartition.put(tp, records);
recordsByPartition.put(tp, batch);
}

String transactionalId = null;
if (transactionManager != null && transactionManager.isTransactional()) {
transactionalId = transactionManager.transactionalId();
}
ProduceRequest.Builder requestBuilder = ProduceRequest.Builder.forMagic(minUsedMagic, acks, timeout,
produceRecordsByPartition, transactionalId);
RequestCompletionHandler callback = new RequestCompletionHandler() {
public void onComplete(ClientResponse response) {
handleProduceResponse(response, recordsByPartition, time.milliseconds());
}
};

String nodeId = Integer.toString(destination);
ClientRequest clientRequest = client.newClientRequest(nodeId, requestBuilder, now, acks != 0,
requestTimeoutMs, callback);
client.send(clientRequest, now);
log.trace("Sent produce request to {}: {}", nodeId, requestBuilder);
}

The producer send flow is as follows:

1. send() computes the target partition according to the partitioning strategy, and a MemoryRecordsBuilder is created with the configured compression type to build the MemoryRecords.

2. The MemoryRecordsBuilder backs a ProducerBatch, which is appended to the RecordAccumulator batches queue.

3. Cluster metadata is obtained via MetadataRequest; the Sender thread groups the batched data by topic + partition into ProduceRequests addressed to each partition's leader and finally sends them through the NetworkClient.
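
The internals traced above map directly onto a handful of producer configs: batch.size bounds each ProducerBatch, linger.ms controls how long a non-full batch may wait, buffer.memory sizes the BufferPool behind free.allocate(), and max.block.ms is the maxBlockTimeMs used for metadata and allocation waits. A hedged configuration sketch; the values are illustrative, not recommendations:

Properties props = new Properties();
props.put(ProducerConfig.BATCH_SIZE_CONFIG, 32 * 1024);            // per-partition batch size in bytes
props.put(ProducerConfig.LINGER_MS_CONFIG, 5);                     // wait up to 5 ms to fill a batch
props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, 64 * 1024 * 1024L); // total BufferPool memory
props.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, 60_000);             // maxBlockTimeMs for metadata + allocation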

Kafka Broker Receive Message (handle ProduceRequest)

// Handle a ProduceRequest from a producer
def handleProduceRequest(request: RequestChannel.Request): Unit = {
val produceRequest = request.body[ProduceRequest] // request body: the acks setting and the MemoryRecords (message data) per partition
val numBytesAppended = request.header.toStruct.sizeOf + request.sizeOfBodyInBytes

if (produceRequest.hasTransactionalRecords) {
val isAuthorizedTransactional = produceRequest.transactionalId != null &&
authorize(request, WRITE, TRANSACTIONAL_ID, produceRequest.transactionalId)
if (!isAuthorizedTransactional) {
sendErrorResponseMaybeThrottle(request, Errors.TRANSACTIONAL_ID_AUTHORIZATION_FAILED.exception)
return
}
// Note that authorization to a transactionalId implies ProducerId authorization

} else if (produceRequest.hasIdempotentRecords && !authorize(request, IDEMPOTENT_WRITE, CLUSTER, CLUSTER_NAME)) {
sendErrorResponseMaybeThrottle(request, Errors.CLUSTER_AUTHORIZATION_FAILED.exception)
return
}

val unauthorizedTopicResponses = mutable.Map[TopicPartition, PartitionResponse]()
val nonExistingTopicResponses = mutable.Map[TopicPartition, PartitionResponse]()
val invalidRequestResponses = mutable.Map[TopicPartition, PartitionResponse]()
val authorizedRequestInfo = mutable.Map[TopicPartition, MemoryRecords]() // MemoryRecords that pass the authorization checks
val authorizedTopics = filterAuthorized(request, WRITE, TOPIC,
produceRequest.partitionRecordsOrFail.asScala.toSeq.map(_._1.topic))

for ((topicPartition, memoryRecords) <- produceRequest.partitionRecordsOrFail.asScala) {
if (!authorizedTopics.contains(topicPartition.topic))
unauthorizedTopicResponses += topicPartition -> new PartitionResponse(Errors.TOPIC_AUTHORIZATION_FAILED)
else if (!metadataCache.contains(topicPartition))
nonExistingTopicResponses += topicPartition -> new PartitionResponse(Errors.UNKNOWN_TOPIC_OR_PARTITION)
else
try {
ProduceRequest.validateRecords(request.header.apiVersion(), memoryRecords)
authorizedRequestInfo += (topicPartition -> memoryRecords)
} catch {
case e: ApiException =>
invalidRequestResponses += topicPartition -> new PartitionResponse(Errors.forException(e))
}
}

// the callback for sending a produce response
def sendResponseCallback(responseStatus: Map[TopicPartition, PartitionResponse]): Unit = {
val mergedResponseStatus = responseStatus ++ unauthorizedTopicResponses ++ nonExistingTopicResponses ++ invalidRequestResponses
var errorInResponse = false

mergedResponseStatus.foreach { case (topicPartition, status) =>
if (status.error != Errors.NONE) {
errorInResponse = true
debug("Produce request with correlation id %d from client %s on partition %s failed due to %s".format(
request.header.correlationId,
request.header.clientId,
topicPartition,
status.error.exceptionName))
}
}

// When this callback is triggered, the remote API call has completed
request.apiRemoteCompleteTimeNanos = time.nanoseconds

// Record both bandwidth and request quota-specific values and throttle by muting the channel if any of the quotas
// have been violated. If both quotas have been violated, use the max throttle time between the two quotas. Note
// that the request quota is not enforced if acks == 0.
val bandwidthThrottleTimeMs = quotas.produce.maybeRecordAndGetThrottleTimeMs(request, numBytesAppended, time.milliseconds())
val requestThrottleTimeMs = if (produceRequest.acks == 0) 0 else quotas.request.maybeRecordAndGetThrottleTimeMs(request)
val maxThrottleTimeMs = Math.max(bandwidthThrottleTimeMs, requestThrottleTimeMs)
if (maxThrottleTimeMs > 0) {
if (bandwidthThrottleTimeMs > requestThrottleTimeMs) {
quotas.produce.throttle(request, bandwidthThrottleTimeMs, sendResponse)
} else {
quotas.request.throttle(request, requestThrottleTimeMs, sendResponse)
}
}

// Send the response immediately. In case of throttling, the channel has already been muted.
// acks == 0: the producer expects no response; if an error occurred, log it and close the connection
if (produceRequest.acks == 0) {
// no operation needed if producer request.required.acks = 0; however, if there is any error in handling
// the request, since no response is expected by the producer, the server will close socket server so that
// the producer client will know that some error has happened and will refresh its metadata
if (errorInResponse) {
val exceptionsSummary = mergedResponseStatus.map { case (topicPartition, status) =>
topicPartition -> status.error.exceptionName
}.mkString(", ")
info(
s"Closing connection due to error during produce request with correlation id ${request.header.correlationId} " +
s"from client id ${request.header.clientId} with ack=0\n" +
s"Topic and partition to exceptions: $exceptionsSummary"
)
closeConnection(request, new ProduceResponse(mergedResponseStatus.asJava).errorCounts)
} else {
// Note that although request throttling is exempt for acks == 0, the channel may be throttled due to
// bandwidth quota violation.
sendNoOpResponseExemptThrottle(request)
}
} else {
sendResponse(request, Some(new ProduceResponse(mergedResponseStatus.asJava, maxThrottleTimeMs)), None)
}
}

def processingStatsCallback(processingStats: FetchResponseStats): Unit = {
processingStats.foreach { case (tp, info) =>
updateRecordConversionStats(request, tp, info)
}
}

if (authorizedRequestInfo.isEmpty)
sendResponseCallback(Map.empty)
else {
val internalTopicsAllowed = request.header.clientId == AdminUtils.AdminClientId

// call the replica manager to append messages to the replicas
// The ReplicaManager persists the records: append to the active LogSegment, then update the log end offset
replicaManager.appendRecords(
timeout = produceRequest.timeout.toLong,
requiredAcks = produceRequest.acks,
internalTopicsAllowed = internalTopicsAllowed,
isFromClient = true,
entriesPerPartition = authorizedRequestInfo,
responseCallback = sendResponseCallback,
recordConversionStatsCallback = processingStatsCallback)

// if the request is put into the purgatory, it will have a held reference and hence cannot be garbage collected;
// hence we clear its data here in order to let GC reclaim its memory since it is already appended to log
produceRequest.clearPartitionRecords()
}
}

The flow for the broker receiving and persisting a message is as follows:

  1. Receive the ProduceRequest and apply authorization and validation filtering.
  2. Persist the data to the log through the ReplicaManager (on the partition leader).
  3. Update the log end offset.
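
The response handling above branches on the producer's acks setting. A hedged sketch of how the common values relate to this flow (illustrative, not from the source):

// acks=0  : the broker sends no response (the sendNoOpResponseExemptThrottle path above)
// acks=1  : the leader responds once the records are appended to its local log
// acks=all: the leader waits for the in-sync replicas before responding
Properties props = new Properties();
props.put(ProducerConfig.ACKS_CONFIG, "all");
// On the topic/broker side, min.insync.replicas controls how many ISR members "all" must reach,
// e.g. min.insync.replicas=2 with a replication factor of 3.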

A note on compression: if the compression type set on the producer differs from the topic's compression.type config, the broker has to decompress and recompress every batch, adding extra CPU overhead. The topic config defaults to producer, meaning the broker keeps whatever codec the producer used, so in practice the two are kept consistent; the broker can then append the compressed batch as-is and serve it to consumers with zero-copy, giving higher throughput.
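
A small sketch of keeping the two sides consistent (the codec choice here is an illustrative assumption):

// Producer side: compress batches with lz4
Properties props = new Properties();
props.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, "lz4");
// Topic side: compression.type defaults to "producer", which tells the broker to keep the
// producer's codec as-is; setting it to a different codec (e.g. "gzip") forces the broker
// to decompress and recompress every batch it receives.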

Kafka Consumer Poll Message Flow

Broker receive-and-process flow