1. 程式人生 > >大數據入門第二十二天——spark(三)自定義分區、排序與查找

大數據入門第二十二天——spark(三)自定義分區、排序與查找

get buffer arr clas ron arm scala mut all

一、自定義分區

  1.概述

    默認的是Hash的分區策略,這點和Hadoop是類似的,具體的分區介紹,參見:https://blog.csdn.net/high2011/article/details/68491115

  2.實現

package cn.itcast.spark.day3

import java.net.URL
import org.apache.spark.{HashPartitioner, Partitioner, SparkConf, SparkContext}
import scala.collection.mutable

/**
  * Created by root on 2016/5/18.
  *
  * Counts how often each URL occurs in a tab-separated log, partitions the
  * (host, (url, count)) records with [[HostParitioner]] (one partition per
  * distinct host), and saves the two highest-count URLs of every partition.
  */
object UrlCountPartition {

  /**
    * Runs the job locally.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UrlCountPartition").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      // rdd1 splits each line on tabs; the tuples hold (URL, 1), the URL being the second field
      val rdd1 = sc.textFile("c://itcast.log").map { line =>
        val f = line.split("\t")
        (f(1), 1)
      }
      // Sum the 1s per URL to get the hit count of every URL.
      val rdd2 = rdd1.reduceByKey(_ + _)
      // Re-key by host so that records can be partitioned per host.
      val rdd3 = rdd2.map { t =>
        val url = t._1
        val host = new URL(url).getHost
        (host, (url, t._2))
      }
      // The distinct hosts are collected to the driver; each one gets its own partition.
      val ints = rdd3.map(_._1).distinct().collect()
      val hostParitioner = new HostParitioner(ints)
      // val rdd4 = rdd3.partitionBy(new HashPartitioner(ints.length))
      val rdd4 = rdd3.partitionBy(hostParitioner).mapPartitions { it =>
        // Sort ascending by count, reverse, and keep the top two of this partition.
        it.toList.sortBy(_._2._2).reverse.take(2).iterator
      }
      rdd4.saveAsTextFile("c://out4")
      //println(rdd4.collect().toBuffer)
    } finally {
      // Stop the context even when the job fails, so the local Spark resources are released.
      sc.stop()
    }
  }
}

/**
  * Decides which partition a record goes to: every entry of `ins` is assigned
  * the partition id equal to its position in the array.
  *
  * @param ins the keys that each receive a partition ([[UrlCountPartition]] passes the distinct hosts)
  */
class HostParitioner(ins: Array[String]) extends Partitioner {

  /** Lookup table from key to partition id (the key's index in `ins`). */
  val parMap: mutable.HashMap[String, Int] = mutable.HashMap(ins.zipWithIndex: _*)

  /** One partition per entry of `ins`. */
  override def numPartitions: Int = ins.length

  /**
    * Returns the partition id registered for `key.toString`.
    * Keys that were not in `ins` fall back to partition 0.
    */
  override def getPartition(key: Any): Int = parMap.getOrElse(key.toString, 0)

  // Value-based equality lets Spark treat two partitioners built from the same
  // keys as the same partitioning instead of comparing object identity.
  override def equals(other: Any): Boolean = other match {
    case that: HostParitioner =>
      numPartitions == that.numPartitions && parMap == that.parMap
    case _ => false
  }

  override def hashCode(): Int = 31 * numPartitions + parMap.hashCode()
}

  // 與Hadoop相通,不再贅述

二、自定義排序

  基本上就是結合之前的隱式轉換(隱式值)了:sortBy 會在作用域內尋找隱式的 Ordering。(這裏使用樣例類可以不用new就能得到實例,另外也可以用於模式匹配)

package cn.itcast.spark.day3

import org.apache.spark.{SparkConf, SparkContext}


/**
  * Holds the implicit [[Ordering]] for `Girl`; import `OrderContext._` to bring
  * it into scope where a sort needs it.
  */
object OrderContext {

  /**
    * Orders by `faceValue` ascending; for equal `faceValue` the larger `age`
    * sorts first. Two values with the same `faceValue` and `age` compare as
    * equal (0), as the `Ordering` contract requires.
    */
  implicit val girlOrdering: Ordering[Girl] = new Ordering[Girl] {
    override def compare(x: Girl, y: Girl): Int = {
      val byFaceValue = Integer.compare(x.faceValue, y.faceValue)
      // Arguments are swapped for age on purpose: a larger age must compare as smaller.
      if (byFaceValue != 0) byFaceValue else Integer.compare(y.age, x.age)
    }
  }
}


/**
  * Created by root on 2016/5/18.
  *
  * Custom-sort example. Sort rule: order by faceValue first, then compare age.
  * The input tuples start with (name, faceValue, age).
  */
object CustomSort {

  /**
    * Sorts a small in-memory data set in descending order and prints the result.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CustomSort").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      val rdd1 = sc.parallelize(List(("yuihatano", 90, 28, 1), ("angelababy", 90, 27, 2),("JuJingYi", 95, 22, 3)))
      // Brings the implicit Ordering[Girl] required by sortBy into scope.
      import OrderContext._
      // The sort key is built from the 2nd and 3rd tuple fields (faceValue, age).
      val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), ascending = false)
      println(rdd2.collect().toBuffer)
    } finally {
      // Stop the context even when the job fails, so the local Spark resources are released.
      sc.stop()
    }
  }

}

/**
  * 第一種方式
  * @param faceValue
  * @param age

case class Girl(val faceValue: Int, val age: Int) extends Ordered[Girl] with Serializable {
  override def compare(that: Girl): Int = {
    if(this.faceValue == that.faceValue) {
      that.age - this.age
    } else {
      this.faceValue -that.faceValue
    }
  }
}
  */

/**
  * Second approach: a plain sort key with no ordering logic of its own; the
  * ordering is supplied from outside through an implicit (see `OrderContext`).
  *
  * @param faceValue the `faceValue` component of the key
  * @param age       the `age` component of the key
  */
case class Girl(
  faceValue: Int,
  age: Int
) extends Serializable

大數據入門第二十二天——spark(三)自定義分區、排序與查找