ITPub博客

首页 > 大数据 > 数据分析 > spark 与flume 1.6.0

spark 与flume 1.6.0

原创 数据分析 作者:hgs19921112 时间:2018-10-18 21:09:35 0 删除 编辑
package hgs.spark.streaming
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.storage.StorageLevel
import org.apache.spark.HashPartitioner
/* pom.xml中加入如下配置:
 * <dependency>
 *   <groupId>org.apache.spark</groupId>
 *   <artifactId>spark-streaming-flume_2.11</artifactId>
 *   <version>2.1.0</version>
 * </dependency>
 */
/*flume的conf文件
a1.sources=r1
a1.sinks=k1
a1.channels=c1
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir=/home/logs
a1.sources.r1.fileHeader=true
a1.sinks.k1.type=avro
a1.sinks.k1.hostname= 192.168.1.9
a1.sinks.k1.port= 8888
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
a1.sources.r1.channels=c1
a1.sinks.k1.channel=c1
#the command to start an agent
#bin/flume-ng agent -n $agent_name -c conf -f conf/flume-conf.properties.template
*/
/**
 * Stateful word count over a Flume stream, using the push-based approach:
 * the Flume agent's avro sink pushes events to a receiver started by Spark.
 *
 * The job runs locally with two threads, uses 5-second batches, and keeps a
 * running total per word across batches via `updateStateByKey`.
 */
object SparkStreamingFlumePush {

  /**
   * Starts the streaming job and blocks until it is terminated.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    import java.nio.charset.StandardCharsets

    val conf = new SparkConf().setAppName("flume-push").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    // A checkpoint directory is set because the job below keeps state across batches.
    ssc.checkpoint("d:\\checkpoint")

    // For every word: add the counts seen in the current batch to the running
    // total carried over from earlier batches (0 when the word is new).
    val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) =>
      iter.map { case (word, batchCounts, runningTotal) =>
        (word, batchCounts.sum + runningTotal.getOrElse(0))
      }

    // There are two ways to receive data: push and poll. This is push: Flume
    // pushes the data to Spark, so the host and port here are Spark's own.
    val rds = FlumeUtils.createStream(ssc, "192.168.1.9", 8888, StorageLevel.MEMORY_ONLY)

    val result = rds
      // Decode with an explicit charset instead of the platform default.
      .flatMap(x => new String(x.event.getBody.array(), StandardCharsets.UTF_8).split(" "))
      // Blank lines and repeated spaces produce empty tokens; they are not words.
      .filter(_.nonEmpty)
      .map(word => (word, 1))
      // The trailing `true` is the rememberPartitioner flag.
      .updateStateByKey(updateFunc, new HashPartitioner(sc.defaultMinPartitions), true)

    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
package hgs.spark.streaming
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.flume.FlumeUtils
import java.net.InetAddress
import java.net.InetSocketAddress
import org.apache.spark.storage.StorageLevel
import org.apache.spark.HashPartitioner
//spark支持1.6.0的flume版本
/* pom.xml中加入如下配置:
 * <dependency>
 *   <groupId>org.apache.spark</groupId>
 *   <artifactId>spark-streaming-flume_2.11</artifactId>
 *   <version>2.1.0</version>
 * </dependency>
 */
/*
 * flume配置
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type=spooldir
a1.sources.r1.spoolDir = /home/logs
a1.sources.r1.fileHeader = true
a1.sinks.k1.type=org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.k1.hostname=192.168.6.129
a1.sinks.k1.port = 8888
a1.channels.c1.type=memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity=100
a1.sources.r1.channels=c1
a1.sinks.k1.channel = c1
#the command to start an agent
#bin/flume-ng agent -n $agent_name -c conf -f conf/flume-conf.properties.template
*/
//同时需要如下三个包,并将这三个包放到flume的classpath下面
/* groupId = org.apache.spark
 artifactId = spark-streaming-flume-sink_2.11
 version = 2.1.0
 
 groupId = org.scala-lang
 artifactId = scala-library
 version = 2.11.7
 
 groupId = org.apache.commons
 artifactId = commons-lang3
 version = 3.5*/
 
/**
 * Stateful word count over a Flume stream, using the pull-based approach:
 * Spark polls events from the Flume agent at the configured address.
 *
 * The job runs locally with two threads, uses 5-second batches, and keeps a
 * running total per word across batches via `updateStateByKey`.
 */
object SparkStreamingFlumePoll {

  /**
   * Starts the streaming job and blocks until it is terminated.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    import java.nio.charset.StandardCharsets

    // App name corrected from the copy-pasted "flume-push": this is the polling job.
    val conf = new SparkConf().setAppName("flume-poll").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    // A checkpoint directory is set because the job below keeps state across batches.
    ssc.checkpoint("d:\\checkpoint")

    // Address of the Flume sink to poll.
    val ipSeq = Seq(new InetSocketAddress("192.168.6.129", 8888))
    // With this approach Spark pulls the data from Flume.
    val rds = FlumeUtils.createPollingStream(ssc, ipSeq, StorageLevel.MEMORY_AND_DISK)

    // For every word: add the counts seen in the current batch to the running
    // total carried over from earlier batches (0 when the word is new).
    val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) =>
      iter.map { case (word, batchCounts, runningTotal) =>
        (word, batchCounts.sum + runningTotal.getOrElse(0))
      }

    val result = rds
      // Decode with an explicit charset instead of the platform default.
      .flatMap(x => new String(x.event.getBody.array(), StandardCharsets.UTF_8).split(" "))
      // Blank lines and repeated spaces produce empty tokens; they are not words.
      .filter(_.nonEmpty)
      .map(word => (word, 1))
      // The trailing `true` is the rememberPartitioner flag.
      .updateStateByKey(updateFunc, new HashPartitioner(sc.defaultMinPartitions), true)

    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
//遇到的错误:flume的lib目录下本来就有scala-library包,再放入一份会因jar包重复而冲突(报错见下方日志),删除其中一个即可
/*18 Oct 2018 20:58:32,123 WARN  [Spark Sink Processor Thread - 10] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:80)  - Error while processing transaction.
java.lang.IllegalStateException: begin() called when transaction is OPEN!
	at com.google.common.base.Preconditions.checkState(Preconditions.java:145)
	at org.apache.flume.channel.BasicTransactionSemantics.begin(BasicTransactionSemantics.java:131)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor$$anonfun$populateEvents$1.apply(TransactionProcessor.scala:114)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor$$anonfun$populateEvents$1.apply(TransactionProcessor.scala:113)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor.populateEvents(TransactionProcessor.scala:113)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor.call(TransactionProcessor.scala:243)
	at org.apache.spark.streaming.flume.sink.TransactionProcessor.call(TransactionProcessor.scala:43)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
18 Oct 2018 20:58:32,128 WARN  [Spark Sink Processor Thread - 10] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:59)  - Spark was unable to successfully process the events. Transaction is being rolled back.
18 Oct 2018 20:58:32,128 WARN  [New I/O  worker #1] (org.apache.spark.streaming.flume.sink.Logging$class.logWarning:59)  - Received an error batch - no events were received from channel! */


来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/31506529/viewspace-2216840/,如需转载,请注明出处,否则将追究法律责任。

请登录后发表评论 登录
全部评论

注册时间:2017-11-22

  • 博文量
    99
  • 访问量
    99307