moving functionality to PulsarHelper
ericm-db committed Aug 15, 2023
1 parent 25c39eb commit e7e87b6
Showing 3 changed files with 85 additions and 70 deletions.
66 changes: 66 additions & 0 deletions src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala
@@ -22,16 +22,22 @@ import scala.collection.mutable
import scala.language.postfixOps
import scala.util.control.NonFatal

import org.apache.pulsar.client.admin.PulsarAdmin
import org.apache.pulsar.client.api.{MessageId, PulsarClient}
import org.apache.pulsar.client.impl.{MessageIdImpl, PulsarClientImpl}
import org.apache.pulsar.client.impl.schema.BytesSchema
import org.apache.pulsar.client.internal.DefaultImplementation
import org.apache.pulsar.common.api.proto.CommandGetTopicsOfNamespace
import org.apache.pulsar.common.naming.TopicName
import org.apache.pulsar.common.schema.SchemaInfo
import org.apache.pulsar.shade.com.google.common.util.concurrent.Uninterruptibles

import org.apache.spark.internal.Logging
import org.apache.spark.sql.connector.read.streaming
import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit}
import org.apache.spark.sql.pulsar.PulsarOptions._
import org.apache.spark.sql.pulsar.PulsarSourceUtils.{getEntryId, getLedgerId}
import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets
import org.apache.spark.sql.types.StructType

/**
@@ -40,6 +46,7 @@ import org.apache.spark.sql.types.StructType
*/
private[pulsar] case class PulsarHelper(
serviceUrl: String,
adminUrl: String,
clientConf: ju.Map[String, Object],
driverGroupIdPrefix: String,
caseInsensitiveParameters: Map[String, String],
@@ -55,6 +62,8 @@ private[pulsar] case class PulsarHelper(
private var topics: Seq[String] = _
private var topicPartitions: Seq[String] = _

private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()

override def close(): Unit = {
// do nothing
}
@@ -207,6 +216,63 @@ private[pulsar] case class PulsarHelper(
}.toMap)
}

def latestOffsets(startingOffset: streaming.Offset,
admissionLimits: AdmissionLimits): SpecificPulsarOffset = {
// implement helper inside PulsarHelper in order to use getTopicPartitions
val latestOffsets = fetchLatestOffsets().topicOffsets
// add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit
// start a reader, get to the earliest offset for new topic partitions
val existingStartOffsets = if (startingOffset != null) {
getTopicOffsets(startingOffset.asInstanceOf[org.apache.spark.sql.execution.streaming.Offset])
} else {
Map[String, MessageId]()
}
val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet)
val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition
=> topicPartition -> fetchLatestOffsetForTopic(topicPartition))
val totalReadLimit = admissionLimits.bytesToTake
val offsets = mutable.Map[String, MessageId]()
offsets ++= startPartitionOffsets
val numPartitions = startPartitionOffsets.size
startPartitionOffsets.keys.filter(topicPartition => {
pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0
}).foreach { topicPartition =>
var readLimit = totalReadLimit / numPartitions
val messageId = startPartitionOffsets.apply(topicPartition)
val ledgerId = getLedgerId(messageId)
val entryId = getEntryId(messageId)
val stats = pulsarAdmin.topics.getInternalStats(topicPartition)
pulsarAdmin.topics.getInternalStats(topicPartition).ledgers.
asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach { ledger =>
ledger.entries = stats.currentLedgerEntries
val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries
// approximation of bytes left in ledger to deal with case
// where we are at the middle of the ledger
val bytesLeftInLedger = avgBytesPerEntries * {
if (ledger.ledgerId == ledgerId) {
ledger.entries - entryId
} else {
ledger.entries
}
}
if (readLimit > bytesLeftInLedger) {
readLimit -= bytesLeftInLedger
offsets += (topicPartition -> DefaultImplementation
.getDefaultImplementation
.newMessageId(ledger.ledgerId, ledger.entries - 1, -1))
} else {
val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries)
val lastEntryRead = Math.min(ledger.entries - 1, entryId + numEntriesToRead)
offsets += (topicPartition -> DefaultImplementation
.getDefaultImplementation
.newMessageId(ledger.ledgerId, lastEntryRead, -1))
readLimit = 0
}
}
}
SpecificPulsarOffset(offsets.toMap)
}

def fetchLatestOffsetForTopic(topic: String): MessageId = {
val messageId =
try {
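Below is a minimal, self-contained sketch of the per-ledger byte budgeting that the new latestOffsets method performs: walk the ledgers from the current position, approximate the bytes left in each ledger as avgBytesPerEntry times the entries still unread, and stop once the per-partition byte budget is spent. LedgerInfo, avgBytesPerEntry and the starting position are hypothetical stand-ins for the internal stats the real code fetches through PulsarAdmin.

// Sketch only: LedgerInfo stands in for Pulsar's internal ledger stats,
// and avgBytesPerEntry is assumed to be positive.
final case class LedgerInfo(ledgerId: Long, entries: Long)

def advanceWithinBudget(
    ledgers: Seq[LedgerInfo],
    startLedgerId: Long,
    startEntryId: Long,
    avgBytesPerEntry: Long,
    budgetBytes: Long): (Long, Long) = {
  var remaining = budgetBytes
  var position = (startLedgerId, startEntryId)
  ledgers.filter(_.ledgerId >= startLedgerId).sortBy(_.ledgerId).foreach { ledger =>
    if (remaining > 0) {
      // Only the starting ledger may already be partially consumed.
      val firstEntry = if (ledger.ledgerId == startLedgerId) startEntryId else 0L
      val entriesLeft = ledger.entries - firstEntry
      val bytesLeft = avgBytesPerEntry * entriesLeft
      if (remaining > bytesLeft) {
        // The budget covers the whole ledger: take it all and keep going.
        remaining -= bytesLeft
        position = (ledger.ledgerId, ledger.entries - 1)
      } else {
        // The budget runs out inside this ledger: convert the leftover bytes
        // into an entry count and stop here.
        val entriesToRead = math.max(1L, remaining / avgBytesPerEntry)
        position = (ledger.ledgerId, math.min(ledger.entries - 1, firstEntry + entriesToRead))
        remaining = 0
      }
    }
  }
  position
}

The real method then turns the resulting (ledgerId, entryId) pair into a MessageId via DefaultImplementation.newMessageId and collects one such offset per topic partition.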
7 changes: 5 additions & 2 deletions src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala
@@ -56,12 +56,13 @@ private[pulsar] class PulsarProvider
parameters: Map[String, String]): (String, StructType) = {

val caseInsensitiveParams = validateStreamOptions(parameters)
val (clientConfig, _, serviceUrlConfig, _) = prepareConfForReader(parameters)
val (clientConfig, _, serviceUrlConfig, adminUrl) = prepareConfForReader(parameters)

val subscriptionNamePrefix = s"spark-pulsar-${UUID.randomUUID}"
val inferredSchema = Utils.tryWithResource(
PulsarHelper(
serviceUrlConfig,
adminUrl,
clientConfig,
subscriptionNamePrefix,
caseInsensitiveParams,
Expand Down Expand Up @@ -91,6 +92,7 @@ private[pulsar] class PulsarProvider
val subscriptionNamePrefix = getSubscriptionPrefix(parameters)
val pulsarHelper = PulsarHelper(
serviceUrl,
adminUrl,
clientConfig,
subscriptionNamePrefix,
caseInsensitiveParams,
Expand Down Expand Up @@ -127,10 +129,11 @@ private[pulsar] class PulsarProvider

val subscriptionNamePrefix = getSubscriptionPrefix(parameters, isBatch = true)

val (clientConfig, readerConfig, serviceUrl, _) = prepareConfForReader(parameters)
val (clientConfig, readerConfig, serviceUrl, adminUrl) = prepareConfForReader(parameters)
val (start, end, schema, pSchema) = Utils.tryWithResource(
PulsarHelper(
serviceUrl,
adminUrl,
clientConfig,
subscriptionNamePrefix,
caseInsensitiveParams,
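Since PulsarHelper now builds a PulsarAdmin client as well as the regular client, readers have to supply both endpoints. A hedged usage sketch follows, assuming the connector's service.url and admin.url option keys; the endpoints and topic name are placeholders, not taken from this commit.

import org.apache.spark.sql.SparkSession

// Hypothetical configuration; option keys and URLs are assumptions.
val spark = SparkSession.builder().appName("pulsar-read-sketch").getOrCreate()

val stream = spark.readStream
  .format("pulsar")
  .option("service.url", "pulsar://localhost:6650") // broker endpoint for the Pulsar client
  .option("admin.url", "http://localhost:8080")     // HTTP endpoint for PulsarAdmin
  .option("topic", "some-topic")
  .load()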
82 changes: 14 additions & 68 deletions src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala
@@ -68,89 +68,25 @@ private[pulsar] class PulsarSource(

private var currentTopicOffsets: Option[Map[String, MessageId]] = None

private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(serviceUrl).build()

private lazy val pulsarSchema: SchemaInfo = pulsarHelper.getPulsarSchema

override def schema(): StructType = SchemaUtils.pulsarSourceSchema(pulsarSchema)

override def getOffset: Option[Offset] = {
// Make sure initialTopicOffsets is initialized
initialTopicOffsets
val latest = pulsarHelper.fetchLatestOffsets()
currentTopicOffsets = Some(latest.topicOffsets)
Some(latest.asInstanceOf[Offset])
throw new UnsupportedOperationException(
"latestOffset(Offset, ReadLimit) should be called instead of this method")
}

override def latestOffset(startingOffset: streaming.Offset,
readLimit: ReadLimit): streaming.Offset = {
initialTopicOffsets
// implement helper inside PulsarHelper in order to use getTopicPartitions
val latestOffsets = pulsarHelper.fetchLatestOffsets().topicOffsets
// add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit
// start a reader, get to the earliest offset for new topic partitions
val existingStartOffsets = if (startingOffset != null) {
getTopicOffsets(startingOffset.asInstanceOf[org.apache.spark.sql.execution.streaming.Offset])
} else {
Map[String, MessageId]()
}
val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet)
val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition
=> topicPartition -> pulsarHelper.fetchLatestOffsetForTopic(topicPartition))
val totalReadLimit = AdmissionLimits(readLimit).get.bytesToTake
val offsets = mutable.Map[String, MessageId]()
offsets ++= startPartitionOffsets
val numPartitions = startPartitionOffsets.size
startPartitionOffsets.keys.filter(topicPartition => {
pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0
}).foreach { topicPartition =>
var readLimit = totalReadLimit / numPartitions
val messageId = startPartitionOffsets.apply(topicPartition)
val ledgerId = getLedgerId(messageId)
val entryId = getEntryId(messageId)
val stats = pulsarAdmin.topics.getInternalStats(topicPartition)
pulsarAdmin.topics.getInternalStats(topicPartition).ledgers.
asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach{ ledger =>
ledger.entries = stats.currentLedgerEntries
val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries
// approximation of bytes left in ledger to deal with case
// where we are at the middle of the ledger
val bytesLeftInLedger = avgBytesPerEntries * {
if (ledger.ledgerId == ledgerId) {
ledger.entries - entryId
} else {
ledger.entries
}
}
if (readLimit > bytesLeftInLedger) {
readLimit -= bytesLeftInLedger
offsets += (topicPartition -> DefaultImplementation
.getDefaultImplementation
.newMessageId(ledger.ledgerId, ledger.entries - 1, -1))
} else {
val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries)
val lastEntryRead = Math.min(ledger.entries - 1, entryId + numEntriesToRead)
offsets += (topicPartition -> DefaultImplementation
.getDefaultImplementation
.newMessageId(ledger.ledgerId, lastEntryRead, -1))
readLimit = 0
}
}
}
SpecificPulsarOffset(offsets.toMap)
val admissionLimits = AdmissionLimits(readLimit)
pulsarHelper.latestOffsets(startingOffset, admissionLimits.get)
}
override def getDefaultReadLimit: ReadLimit = {
ReadMaxBytes.apply(maxBytesPerTrigger)
}
class AdmissionLimits(var bytesToTake: Long)

object AdmissionLimits {
def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match {
case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes))
case _ : ReadAllAvailable => Some(new AdmissionLimits(Int.MaxValue))
}

}

override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
// Make sure initialTopicOffsets is initialized
@@ -257,3 +193,13 @@ private[pulsar] class PulsarSource(

/** A read limit that admits a soft-max of `maxBytes` per micro-batch. */
case class ReadMaxBytes(maxBytes: Long) extends ReadLimit

class AdmissionLimits(var bytesToTake: Long)

object AdmissionLimits {
def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match {
case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes))
case _: ReadAllAvailable => Some(new AdmissionLimits(Int.MaxValue))
}

}
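To show how these pieces connect, here is a short sketch of the read-limit plumbing: getDefaultReadLimit advertises a ReadMaxBytes cap, and AdmissionLimits turns whichever ReadLimit Spark passes back into the byte budget handed to PulsarHelper.latestOffsets. The 16 MiB figure below is an arbitrary example, not a connector default.

import org.apache.spark.sql.connector.read.streaming.ReadLimit

// Example only: a 16 MiB soft cap per micro-batch.
val limit: ReadLimit = ReadMaxBytes(16L * 1024 * 1024)
val budget = AdmissionLimits(limit).map(_.bytesToTake) // Some(16777216)

// With no cap, Spark can pass ReadLimit.allAvailable(), which maps to
// Int.MaxValue bytes in the apply method above.
val unbounded = AdmissionLimits(ReadLimit.allAvailable()).map(_.bytesToTake)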
