ITPub博客

首页 > 大数据 > 数据分析 > spark direct kafka 将offset推到zookeeper

spark direct kafka 将offset推到zookeeper

原创 数据分析 作者:破棉袄 时间:2016-09-30 15:56:31 0 删除 编辑


1、Spark Streaming 以 direct 方式读取 Kafka 的性能要好很多,缺点是它不会把 offset 更新到 ZooKeeper,这将导致基于 ZooKeeper 的 KafkaOffsetMonitor 监控失效。由于我们流处理的数据量大,并且健壮性要求高,需要通过 KafkaOffsetMonitor 实时监控数据消费情况,故自己实现了将 offset 推送到 ZooKeeper 的逻辑,如下:


2、spark代码:

  /**
   * Spark Streaming job that reads JSON messages from Kafka through the direct
   * (receiver-less) API, groups them by message type, appends each type to its
   * own Parquet directory, and then writes the consumed offsets to ZooKeeper
   * under the consumer-group offset path.
   *
   * The direct API does not update ZooKeeper on its own, so the offsets are
   * pushed manually after every batch. On start-up the job resumes from the
   * offsets found in ZooKeeper, if any.
   */
  object KSConvertStreaming {

    /** Output path template, filled with the first four '-'-separated parts of the message type. */
    val savaLocal = "/xxx/parquet/%s/year=%s/month=%s/day=%s"

    /**
     * Converts the raw lines of one partition and groups the results by message type.
     *
     * @param jsonStrs raw input lines of a single partition
     * @return one `(messageType, convertedRecords)` pair per distinct message type
     */
    def jsonConvert(jsonStrs: Iterator[String]): Iterator[(String, ArrayBuffer[String])] = {
      StreamingUtils.init
      val typeMap = scala.collection.mutable.Map[String, ArrayBuffer[String]]()
      jsonStrs.foreach { line =>
        // A null result is skipped. NOTE(review): presumably mapToStr returns null for
        // records it cannot convert -- confirm against StreamingUtils.
        Option(StreamingUtils.mapToStr(line)).foreach { res =>
          // Element 0 is used as the message type, element 1 as the converted record.
          typeMap.getOrElseUpdate(res.get(0), new ArrayBuffer[String]()) += res.get(1)
        }
      }
      typeMap.iterator
    }

    /**
     * Entry point.
     *
     * @param args exactly five values, in order: maxPartition (upper bound on output
     *             partitions per type), maxNumber (target records per output partition),
     *             windownsS (batch interval in seconds), groupName (Kafka consumer group),
     *             maxRatePerPartition (value for spark.streaming.kafka.maxRatePerPartition)
     * @throws IllegalArgumentException if the argument count is wrong or a numeric
     *                                  limit is not positive
     */
    def main(args: Array[String]): Unit = {
      require(args.length == 5,
        "usage: KSConvertStreaming <maxPartition> <maxNumber> <windownsS> <groupName> <maxRatePerPartition>")
      val Array(maxPartition, maxNumber, windownsS, groupName, maxRatePerPartition) = args

      // Parse the numeric arguments once, up front, so a bad value fails at start-up
      // instead of inside the first batch.
      val maxPartitionCount = maxPartition.toLong
      val maxRecordsPerPartition = maxNumber.toLong
      val batchSeconds = windownsS.toInt
      require(maxPartitionCount > 0, s"maxPartition must be positive, got $maxPartition")
      require(maxRecordsPerPartition > 0, s"maxNumber must be positive, got $maxNumber")

      val topicName = "xxx"
      val kafkaAddr = "xxx:9092,xxx:9092,xxx:9092"
      val kafkaParams = Map[String, String](
        "metadata.broker.list" -> kafkaAddr,
        "group.id" -> groupName,
        "auto.offset.reset" -> "largest"
      )
      val topics = Set(topicName)

      println(s"maxPartition -------- $maxPartition")
      println(s"maxNumber -------- $maxNumber")
      println(s"windownsS -------- $windownsS")
      println(s"groupName -------- $groupName")
      println(s"maxRatePerPartition -------- $maxRatePerPartition")

      val sparkConf = new SparkConf().setAppName("Streaming_Convert")
        .set("spark.yarn.executor.memoryOverhead", "1024")
        // Maximum number of records read per second from each Kafka partition.
        .set("spark.streaming.kafka.maxRatePerPartition", maxRatePerPartition)
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .set("spark.reducer.maxSizeInFlight", "1m")
      val sc = new SparkContext(sparkConf)
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)
      val ssc = new StreamingContext(sc, Seconds(batchSeconds)) // batch interval in seconds

      val topicDirs = new ZKGroupTopicDirs(groupName, topicName)
      val zkClient = new ZkClient("xxx:2181,xxx:2181,xxx:2181", Integer.MAX_VALUE, 100000, ZKStringSerializer)

      try {
        // Number of partition nodes already stored under the group's offset path.
        val children = zkClient.countChildren(topicDirs.consumerOffsetDir)

        val kafkaStream: InputDStream[(String, String)] =
          if (children > 0) {
            // Resume from the stored offsets; partition nodes are read as 0 until children.
            val fromOffsets: Map[TopicAndPartition, Long] = (0 until children).map { i =>
              val partitionOffset = zkClient.readData[String](s"${topicDirs.consumerOffsetDir}/$i")
              TopicAndPartition(topicName, i) -> partitionOffset.toLong
            }.toMap
            val messageHandler =
              (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
            KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
              ssc, kafkaParams, fromOffsets, messageHandler)
          } else {
            // Nothing stored yet: start according to auto.offset.reset.
            KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
              ssc, kafkaParams, topics)
          }

        kafkaStream.foreachRDD { rdd =>
          // The cast is done on the RDD handed over by the direct stream itself, before
          // any transformation, and kept in a local val instead of shared mutable state.
          val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

          val xRDD = rdd.map(_._2).flatMap(_.split("\n")).mapPartitions(lines => jsonConvert(lines))
          xRDD.persist(StorageLevel.MEMORY_ONLY)
          try {
            // Each partition reports its own per-type count; merge them on the driver.
            val typeCounts: Map[String, Long] = xRDD.map(x => (x._1, x._2.size)).collect()
              .groupBy(_._1)
              .map { case (msgType, counts) => msgType -> counts.map(_._2.toLong).sum }

            typeCounts.foreach { case (msgType, count) => println(s"$msgType:$count") }
            println(s"total : ${typeCounts.values.sum}")

            // Write the types in descending order of record count.
            typeCounts.toSeq.sortWith(_._2 > _._2).foreach { case (pointType, count) =>
              println(s"save type( $pointType ) count( $count )")

              val outputPath = pointType.split('-') match {
                case Array(p0, p1, p2, p3, _*) => savaLocal.format(p0, p1, p2, p3)
                case _ =>
                  throw new IllegalArgumentException(
                    s"message type '$pointType' does not have four '-'-separated parts")
              }

              val jsonRDD = xRDD.filter(entry => entry._1 == pointType).flatMap(entry => entry._2)

              // Roughly count / maxNumber output partitions, clamped to [1, maxPartition].
              val partitionNum =
                math.min(math.max(count / maxRecordsPerPartition, 1L), maxPartitionCount)
              println(s"\trepartition ( $partitionNum )")

              sqlContext.read.json(jsonRDD)
                .repartition(partitionNum.toInt)
                .write.mode(SaveMode.Append)
                .parquet(outputPath)
            }
          } finally {
            // Release the cached blocks even when a write fails.
            xRDD.unpersist()
          }

          // Reached only when every write succeeded: publish the end offset of each
          // partition to ZooKeeper.
          for (o <- offsetRanges) {
            val zkPath = s"${topicDirs.consumerOffsetDir}/${o.partition}"
            ZkUtils.updatePersistentPath(zkClient, zkPath, o.untilOffset.toString)
          }
        }

        ssc.start()
        ssc.awaitTermination()
      } finally {
        zkClient.close()
      }
    }

  }


来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/29754888/viewspace-2125804/,如需转载,请注明出处,否则将追究法律责任。

请登录后发表评论 登录
全部评论

注册时间:2014-07-16

  • 博文量
    180
  • 访问量
    1111065