大數據入門第二十二天——spark(三)自定義分區、排序與查找
阿新 • • 發佈:2018-04-03
get buffer arr clas ron arm scala mut all
一、自定義分區
1.概述
默認的是Hash的分區策略,這點和Hadoop是類似的,具體的分區介紹,參見:https://blog.csdn.net/high2011/article/details/68491115
2.實現
package cn.itcast.spark.day3
import java.net.URL
import org.apache.spark.{HashPartitioner, Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
/**
* Created by root on 2016/5/18.
*/
object UrlCountPartition {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("UrlCountPartition").setMaster("local[2]")
val sc = new SparkContext(conf)
//rdd1將數據切分,元組中放的是(URL, 1)
val rdd1 = sc.textFile("c://itcast.log").map(line => {
val f = line.split("\t")
(f( 1), 1)
})
val rdd2 = rdd1.reduceByKey(_ + _)
val rdd3 = rdd2.map(t => {
val url = t._1
val host = new URL(url).getHost
(host, (url, t._2))
})
val ints = rdd3.map(_._1).distinct().collect()
val hostParitioner = new HostParitioner(ints)
// val rdd4 = rdd3.partitionBy(new HashPartitioner(ints.length))
val rdd4 = rdd3.partitionBy(hostParitioner).mapPartitions(it => {
it.toList.sortBy(_._2._2).reverse.take(2).iterator
})
rdd4.saveAsTextFile("c://out4")
//println(rdd4.collect().toBuffer)
sc.stop()
}
}
/**
* 決定了數據到哪個分區裏面
* @param ins
*/
class HostParitioner(ins: Array[String]) extends Partitioner {
val parMap = new mutable.HashMap[String, Int]()
var count = 0
for(i <- ins){
parMap += (i -> count)
count += 1
}
override def numPartitions: Int = ins.length
override def getPartition(key: Any): Int = {
parMap.getOrElse(key.toString, 0)
}
}
// 與Hadoop相通,不再贅述
二、自定義排序
基本上就是結合之前的隱式轉換了:(這裏使用樣例類可以不用new就能得到實例,另外也可以用於模式匹配)
package cn.itcast.spark.day3
import org.apache.spark.{SparkConf, SparkContext}
object OrderContext {
implicit val girlOrdering = new Ordering[Girl] {
override def compare(x: Girl, y: Girl): Int = {
if(x.faceValue > y.faceValue) 1
else if (x.faceValue == y.faceValue) {
if(x.age > y.age) -1 else 1
} else -1
}
}
}
/**
* Created by root on 2016/5/18.
*/
//sort =>規則 先按faveValue,比較年齡
//name,faveValue,age
object CustomSort {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("CustomSort").setMaster("local[2]")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List(("yuihatano", 90, 28, 1), ("angelababy", 90, 27, 2),("JuJingYi", 95, 22, 3)))
import OrderContext._
val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), false)
println(rdd2.collect().toBuffer)
sc.stop()
}
}
/**
* 第一種方式
* @param faceValue
* @param age
case class Girl(val faceValue: Int, val age: Int) extends Ordered[Girl] with Serializable {
override def compare(that: Girl): Int = {
if(this.faceValue == that.faceValue) {
that.age - this.age
} else {
this.faceValue -that.faceValue
}
}
}
*/
/**
* 第二種,通過隱式轉換完成排序
* @param faceValue
* @param age
*/
case class Girl(faceValue: Int, age: Int) extends Serializable
大數據入門第二十二天——spark(三)自定義分區、排序與查找