Apache Spark for library developers
William Benton
willb@redhat.com
@willb
Erik Erlandson
eje@redhat.com
@manyangled
About Will
The Silex and Isarn libraries
Reusable open-source code that works with Spark, factored from internal apps.
We’ve tracked Spark releases since Spark 1.3.0.
See https://silex.radanalytics.io and http://isarnproject.org
Forecast
Basic considerations for reusable Spark code
Generic functions for parallel collections
Extending data frames with custom aggregates
Exposing JVM libraries to Python
Sharing your work with the world
Basic considerations
Today’s main themes
Cross-building for Scala

in your SBT build definition:

scalaVersion := "2.11.11"
crossScalaVersions := Seq("2.10.6", "2.11.11")

in your shell:

$ sbt +compile          # or test, package, publish, etc.
$ sbt "++ 2.11.11" compile
“Bring-your-own Spark”

in your SBT build definition, Spark artifacts are marked Provided, so your library compiles against them but does not bundle them:

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
  "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
  "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
  "org.scalatest" %% "scalatest" % "2.2.4" % Test)

other dependencies, like joda-time here, stay in the default compile scope and ship with your library:

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
  "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
  "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
  "joda-time" % "joda-time" % "2.7",
  "org.scalatest" %% "scalatest" % "2.2.4" % Test)
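The other side of that contract, sketched here as an assumption rather than anything on the slides: a hypothetical downstream application build that depends on your published library (made-up coordinates) and supplies Spark itself.

libraryDependencies ++= Seq(
  // hypothetical coordinates for your published library
  "com.example" %% "my-spark-library" % "0.1.0",
  // the application chooses the Spark it actually runs against
  // (or marks these Provided as well, if spark-submit supplies them)
  "org.apache.spark" %% "spark-sql" % "2.3.0"
)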
Taking care with resources
Caching when necessary

def step(rdd: RDD[_]) = {
  val wasUncached = rdd.storageLevel == StorageLevel.NONE
  if (wasUncached) { rdd.cache() }      // cache the input only if the caller hasn't already
  val result = trainModel(rdd)
  if (wasUncached) { rdd.unpersist() }  // ...and release it once we're done with it
  result
}
Unpersisting broadcast variables

var nextModel = initialModel
for (i <- 0 until iterations) {
  val current = sc.broadcast(nextModel)   // broadcast the current model to the executors
  val newState = ???                      // elided on the slide: compute the next state using `current`
  nextModel = modelFromState(newState)
  current.unpersist()                     // release the broadcast once it is no longer needed
}
Minding the JVM heap

val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))

(figure: the outer array holds element pointers to two inner arrays, and each of the three array objects carries a header with a class pointer, flags, size, and locks; the four doubles are 32 bytes of data… and 64 bytes of overhead!)
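One way to avoid most of that per-object overhead, sketched here as an illustration rather than anything from the talk, is to store the matrix as a single flat array and index it in row-major order:

// the same 2 x 2 matrix as one flat Array[Double]: one object header instead of three
val rows = 2
val cols = 2
val flat = Array(1.0, 2.0, 3.0, 4.0)

// hypothetical helper: element (i, j) in row-major order
def at(i: Int, j: Int): Double = flat(i * cols + j)

at(1, 0)   // 3.0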
Continuous integration for Spark libraries and apps

(figures: a CI worker has limited CPU and memory; the slides contrast a test master of local[*] with an explicit, smaller local[2])
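A minimal sketch of a test fixture that pins the test session to two local cores; this assumes ScalaTest and is not prescribed by the talk:

import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, Suite}

// hypothetical shared-session trait for test suites running on small CI workers
trait LocalSparkSession extends BeforeAndAfterAll { this: Suite =>
  @transient var spark: SparkSession = null

  override def beforeAll(): Unit = {
    super.beforeAll()
    spark = SparkSession.builder()
      .master("local[2]")                   // ask for two cores, not every core on the CI worker
      .appName("library-tests")
      .config("spark.ui.enabled", "false")  // no web UI needed during tests
      .getOrCreate()
  }

  override def afterAll(): Unit = {
    try {
      if (spark != null) spark.stop()
    } finally {
      super.afterAll()
    }
  }
}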
Writing generic code for
Spark’s parallel collections
The RDD is invariant

T <: U does not imply RDD[T] <: RDD[U]

(a dog is an animal, but an RDD of dogs is not an RDD of animals)
trait HasUserId { val userid: Int }

case class Transaction(override val userid: Int,
                       timestamp: Int,
                       amount: Double) extends HasUserId {}

def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))

val xacts = sc.parallelize(Array(
  Transaction(1, 1, 1.0),
  Transaction(2, 2, 1.0)
))

badKeyByUserId(xacts)

<console>: error: type mismatch;
 found   : org.apache.spark.rdd.RDD[Transaction]
 required: org.apache.spark.rdd.RDD[HasUserId]
Note: Transaction <: HasUserId, but class RDD is invariant in type T.
You may wish to define T as +T instead. (SLS 4.5)
       badKeyByUserId(xacts)
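The usual way to make this compile, sketched here as an illustration (the talk's own fix isn't in this extract), is to write the function generically with an upper type bound, so the concrete element type is preserved:

import scala.reflect.ClassTag
import org.apache.spark.rdd.RDD

// accepts an RDD of any subtype of HasUserId and preserves that type in the result
def keyByUserId[T <: HasUserId : ClassTag](r: RDD[T]): RDD[(Int, T)] =
  r.map(x => (x.userid, x))

keyByUserId(xacts)   // RDD[(Int, Transaction)]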
An example: natural join

(figure: a table with columns A B C D E and a table with columns A B E X Y join on their common columns to produce A B C D E X Y)
Ad-hoc natural join

df1.join(df2, df1("a") === df2("a") &&
              df1("b") === df2("b") &&
              df1("e") === df2("e"))
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
  // introspect over the column names of both frames
  val lcols = left.columns
  val rcols = right.columns
  val ccols = lcols.toSet intersect rcols.toSet

  if (ccols.isEmpty)
    left.limit(0).crossJoin(right.limit(0))
  else
    left
      // dynamically construct the join expression:
      //   left.a === right.a && left.b === right.b && ...
      .join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
      // dynamically construct the output column list:
      // common columns (taken from the left), then left-only, then right-only columns
      .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
              lcols.collect { case c if !ccols.contains(c) => left(c) } ++
              rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
}
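The wrapper on the next slide calls super.natjoin, so natjoin evidently lives in a trait; a minimal sketch of that enclosing trait, where the name NaturalJoining comes from the next slide and the packaging is assumed:

import org.apache.spark.sql.DataFrame

trait NaturalJoining {
  // same implementation as shown above
  def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
    val lcols = left.columns
    val rcols = right.columns
    val ccols = lcols.toSet intersect rcols.toSet

    if (ccols.isEmpty)
      left.limit(0).crossJoin(right.limit(0))
    else
      left
        .join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
        .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
                lcols.collect { case c if !ccols.contains(c) => left(c) } ++
                rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
  }
}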
case class DFWithNatJoin(df: DataFrame) extends NaturalJoining {
  def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other)
}

object NaturalJoin extends NaturalJoining {
  object implicits {
    implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df)
  }
}

import NaturalJoin.implicits._
df.natjoin(otherdf)
User-defined functions

{"a": 1, "b": "wilma", ..., "x": "club"}
{"a": 2, "b": "betty", ..., "x": "diamond"}
{"a": 3, "b": "fred", ..., "x": "heart"}
{"a": 4, "b": "barney", ..., "x": "spade"}

extracting just the "b" and "x" fields from each record:

wilma    club
betty    diamond
fred     heart
barney   spade
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf

def selectively_structure(fields):
    resultType = StructType([StructField(f, StringType(), nullable=True)
                             for f in fields])
    def impl(js):
        try:
            d = json.loads(js)
            return [str(d.get(f)) for f in fields]
        except:
            return [None] * len(fields)
    return udf(impl, resultType)

extract_bx = selectively_structure(["b", "x"])
structured_df = df.withColumn("result", extract_bx("json"))
Spark’s ML pipelines

estimator.fit(df)     (an Estimator is fit to a DataFrame and produces a Model)
model.transform(df)   (a Model transforms a DataFrame into a new DataFrame)

(figure: both are configured through Params such as inputCol, outputCol, epochs, and seed)
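To make the fit/transform pattern concrete, here is a small illustrative pipeline built from stock Spark ML stages; the data and column names are hypothetical and an active SparkSession named `spark` is assumed (the talk's own estimator isn't shown in this extract):

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

// tiny hypothetical training set
val trainDF = spark.createDataFrame(Seq(
  (0L, "spark makes clusters go", 1.0),
  (1L, "completely unrelated text", 0.0)
)).toDF("id", "text", "label")

// Params like inputCol and outputCol configure each pipeline stage
val tok = new Tokenizer().setInputCol("text").setOutputCol("words")
val tf  = new HashingTF().setInputCol("words").setOutputCol("features")
val lr  = new LogisticRegression().setMaxIter(10)

val pipeline = new Pipeline().setStages(Array(tok, tf, lr))

val model  = pipeline.fit(trainDF)      // Estimator.fit produces a Model
val scored = model.transform(trainDF)   // Model.transform produces a new DataFrame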
Forecast
Basic considerations for reusable Spark code
Generic functions for parallel collections
Extending data frames with custom aggregates
Exposing JVM libraries to Python
Sharing your work with the world
About Erik
User-defined aggregates: the fundamentals
Three components
User-defined aggregates: the implementation
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
  (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
  extends UserDefinedAggregateFunction {

  def deterministic: Boolean = false

  def inputSchema: StructType =
    StructType(StructField("x", dataTpe.tpe) :: Nil)

  def bufferSchema: StructType =
    StructType(StructField("tdigest", TDigestUDT) :: Nil)

  def dataType: DataType = TDigestUDT

  // ... (the four aggregation functions follow)
Four main functions: initialize and evaluate

def initialize(buf: MutableAggregationBuffer): Unit = {
  buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
}

def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
Four main functions: update and merge

def update(buf: MutableAggregationBuffer, input: Row): Unit = {
  if (!input.isNullAt(0)) {
    buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
                        num.toDouble(input.getAs[N](0)))
  }
}

// merge combines two partial digests (1 and 2) into one (1 + 2)
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
  buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
                       buf2.getAs[TDigestSQL](0).tdigest)
}
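Putting the aggregate to work looks roughly like this; the sketch assumes the library supplies an implicit TDigestUDAFDataType[Double], as the class signature requires, and the DataFrame and parameter values are hypothetical:

import org.apache.spark.sql.functions.col

// construct the aggregator for Double-valued columns (parameter values are hypothetical);
// this relies on an implicit Numeric[Double] plus the library's TDigestUDAFDataType[Double]
val tdAgg = TDigestUDAF[Double](deltaV = 0.5, maxDiscreteV = 0)

// UserDefinedAggregateFunction.apply yields a Column, so it drops straight into agg()
val sketched = df.agg(tdAgg(col("x")).alias("xsketch"))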
User-defined aggregates: user-defined types
User-defined types

package org.apache.spark.isarnproject.sketches.udt

@SQLUserDefinedType(udt = classOf[TDigestUDT])
case class TDigestSQL(tdigest: TDigest)

class TDigestUDT extends UserDefinedType[TDigestSQL] {
  def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
  // ....

(note the package prefix: the UserDefinedType API is private to org.apache.spark, so the UDT has to live under that package)
Implementing custom types

class TDigestUDT extends UserDefinedType[TDigestSQL] {
  def userClass: Class[TDigestSQL] = classOf[TDigestSQL]

  override def pyUDT: String =
    "isarnproject.sketches.udt.tdigest.TDigestUDT"

  override def typeName: String = "tdigest"

  def sqlType: DataType = StructType(
    StructField("delta", DoubleType, false) ::
    /* ... */
    StructField("clustM", ArrayType(DoubleType, false), false) ::
    Nil)
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)

private[sketches] def serializeTD(td: TDigest): InternalRow = {
  val TDigest(delta, maxDiscrete, nclusters, clusters) = td
  val row = new GenericInternalRow(5)
  row.setDouble(0, delta)
  row.setInt(1, maxDiscrete)
  row.setInt(2, nclusters)
  val clustX = clusters.keys.toArray
  val clustM = clusters.values.toArray
  row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
  row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
  row
}
def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))

private[sketches] def deserializeTD(datum: Any): TDigest =
  datum match { case row: InternalRow =>
    val delta = row.getDouble(0)
    val maxDiscrete = row.getInt(1)
    val nclusters = row.getInt(2)
    val clustX = row.getArray(3).toDoubleArray()
    val clustM = row.getArray(4).toDoubleArray()
    val clusters = clustX.zip(clustM)
      .foldLeft(TDigestMap.empty) { case (td, e) => td + e }
    TDigest(delta, maxDiscrete, nclusters, clusters)
  }
Extending PySpark with your Scala library
# class to access the active Spark context for Python
from pyspark.context import SparkContext
# gateway to the JVM from py4j
sparkJVM = SparkContext._active_spark_context._jvm
# use the gateway to access JVM objects and classes
thisThing = sparkJVM.com.path.to.this.thing
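Anything on the driver's JVM classpath is reachable through that gateway; as a quick sanity check from a running PySpark session (nothing isarn-specific is assumed here), you can call a plain JDK method:

from pyspark.context import SparkContext

sparkJVM = SparkContext._active_spark_context._jvm
# any class visible to the driver JVM can be called via py4j
print(sparkJVM.java.lang.System.currentTimeMillis())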
#SAISDD6
A Python-friendly wrapper
package org.isarnproject.sketches.udaf

object pythonBindings {
  def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) =
    TDigestUDAF[Double](delta, maxDiscrete)
}
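The binding is deliberately monomorphic: py4j can call a method that takes plain Double and Int arguments, but it has no convenient way to supply the Scala type parameter and implicit evidence that TDigestUDAF[N] expects, so the library exposes one concrete entry point per element type. As a quick, hypothetical check from PySpark, the object is reachable through the same gateway as before:

# sparkJVM is the py4j gateway from the previous slides
pb = sparkJVM.org.isarnproject.sketches.udaf.pythonBindings
jvm_udaf = pb.tdigestDoubleUDAF(0.5, 0)  # a JVM-side TDigestUDAF[Double] handle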
#SAISDD6
from pyspark.sql.column import Column, _to_java_column, _to_seq
from pyspark.context import SparkContext

# one of these for each type parameter Double, Int, Long, etc
def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0):
    sc = SparkContext._active_spark_context
    pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings
    tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply
    return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
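With that wrapper importable on the Python side, the aggregate reads like any built-in function. A minimal sketch, assuming a DataFrame df with a numeric column "x" and that the wrapper lives in the isarnproject.sketches.udaf.tdigest module packaged later in this deck:

from isarnproject.sketches.udaf.tdigest import tdigestDoubleUDAF

# sketch the distribution of column "x" into a single t-digest
sketch = df.agg(tdigestDoubleUDAF("x", delta=0.5, maxDiscrete=0)).first()[0]
# once the UDT on the following slides is registered, `sketch` arrives as a
# Python TDigest object rather than an opaque struct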
#SAISDD6
class TDigestUDT(UserDefinedType):
    @classmethod
    def sqlType(cls):
        return StructType([
            StructField("delta", DoubleType(), False),
            StructField("maxDiscrete", IntegerType(), False),
            StructField("nclusters", IntegerType(), False),
            StructField("clustX", ArrayType(DoubleType(), False), False),
            StructField("clustM", ArrayType(DoubleType(), False), False)])
    # ...
#SAISDD6
class TDigestUDT(UserDefinedType):
    # ...

    @classmethod
    def module(cls):
        return "isarnproject.sketches.udt.tdigest"

    @classmethod
    def scalaUDT(cls):
        return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"

    def simpleString(self):
        return "tdigest"
#SAISDD6
class TDigestUDT(UserDefinedType):
    # ...

    def serialize(self, obj):
        return (obj.delta, obj.maxDiscrete, obj.nclusters,
                [float(v) for v in obj.clustX],
                [float(v) for v in obj.clustM])

    def deserialize(self, datum):
        return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
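A local round trip is a handy sanity check that the two halves agree. This sketch assumes the Python TDigest constructor takes the same five fields, in the order deserialize passes them, and exposes them as the attributes serialize reads (and that both classes are importable from the udt module packaged below):

from isarnproject.sketches.udt.tdigest import TDigest, TDigestUDT

udt = TDigestUDT()
td = TDigest(0.5, 0, 2, [1.0, 2.0], [3.0, 4.0])  # hypothetical tiny digest
roundtrip = udt.deserialize(udt.serialize(td))
# under the assumptions above, roundtrip carries the same five fields as td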
#SAISDD6
class TDigestUDT extends UserDefinedType[TDigestSQL] {
  // ...
  override def pyUDT: String =
    "isarnproject.sketches.udt.tdigest.TDigestUDT"
}
#SAISDD6
Python code in JAR files
mappings in (Compile, packageBin) ++= Seq(
  (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
    "isarnproject/__init__.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
    "isarnproject/sketches/__init__.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
    "isarnproject/sketches/udaf/__init__.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
    "isarnproject/sketches/udaf/tdigest.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
    "isarnproject/sketches/udt/__init__.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
    "isarnproject/sketches/udt/tdigest.pyc"
)
#SAISDD6
Cross-building for Python
lazy val compilePython = taskKey[Unit]("Compile python files")

compilePython := {
  val s: TaskStreams = streams.value
  s.log.info("compiling python...")
  val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
  if (stat != 0) {
    throw new IllegalStateException("python compile failed")
  }
}

(packageBin in Compile) <<=
  (packageBin in Compile).dependsOn(compilePython)
#SAISDD6
Using versioned JAR files
$ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7'
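Because the compiled Python modules ride along inside that JAR, a session launched this way should be able to import the bindings directly, with no separate pip install; a quick check (module paths follow the JAR layout shown earlier):

from isarnproject.sketches.udaf.tdigest import tdigestDoubleUDAF
from isarnproject.sketches.udt.tdigest import TDigestUDT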
Show your work: publishing results
#SAISDD6
Developing with git-flow
$ brew install git-flow # macOS
$ dnf install git-flow # Fedora
$ yum install git-flow # CentOS
$ apt-get install git-flow # Debian and friends
(Search the internet for “git flow” to learn more!)
#SAISDD6
# Set up git-flow in this repository
$ git flow init
# Start work on my-awesome-feature; create
# and switch to a feature branch
$ git flow feature start my-awesome-feature
$ ...
# Finish work on my-awesome-feature; merge
# feature/my-awesome-feature to develop
$ git flow feature finish my-awesome-feature
#SAISDD6
# Start work on a release branch
$ git flow release start 0.1.0
# Hack and bump version numbers
$ ...
# Finish work on v0.1.0; merge
# release/0.1.0 to develop and master;
# tag v0.1.0
$ git flow release finish 0.1.0
#SAISDD6
                                          Maven Central    Bintray
easy to set up for library developers     not really       trivial
easy to set up for library users          trivial          mostly
easy to publish                           yes, via sbt     yes, via sbt + plugins
easy to resolve artifacts                 yes              mostly
Conclusions and takeaways
#SAISDD6
https://radanalytics.io
eje@redhat.com • @manyangled
willb@redhat.com • @willb
KEEP IN TOUCH


Apache Spark for Library Developers with Erik Erlandson and William Benton

  • 1. Apache Spark for 
 library developers William Benton willb@redhat.com @willb Erik Erlandson eje@redhat.com @manyangled
  • 3. #SAISDD6 The Silex and Isarn libraries Reusable open-source code that works 
 with Spark, factored from internal apps. We’ve tracked Spark releases since Spark 1.3.0. See https://silex.radanalytics.io and 
 http://isarnproject.org
  • 5. #SAISDD6 Forecast Basic considerations for reusable Spark code Generic functions for parallel collections Extending data frames with custom aggregates Exposing JVM libraries to Python Sharing your work with the world
  • 17. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 18. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile $ sbt +compile # or test, package, publish, etc. $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 19. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile $ sbt +compile # or test, package, publish, etc. $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 20. #SAISDD6 in your SBT build definition: Bring-your-own Spark libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 21. #SAISDD6 in your SBT build definition: Bring-your-own Spark libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 22. #SAISDD6 in your SBT build definition: “Bring-your-own Spark” libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 23. #SAISDD6 in your SBT build definition: “Bring-your-own Spark” libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 27. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } rdd.cache()
  • 28. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } rdd.cache()
  • 29. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } rdd.cache() rdd.unpersist()
  • 30. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result }
  • 31. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result }
  • 32. #SAISDD6 nextModel = modelFromState(newState) current.unpersist } var nextModel = initialModel for (int i = 0; i < iterations; i++) { val current = sc.broadcast(nextModel) val newState = current.unpersist sc.broadcast(nextModel)
  • 33. #SAISDD6 nextModel = modelFromState(newState) current.unpersist } var nextModel = initialModel for (int i = 0; i < iterations; i++) { val current = sc.broadcast(nextModel) val newState = current.unpersist sc.broadcast(nextModel)
  • 34. #SAISDD6 nextModel = modelFromState(newState) current.unpersist } var nextModel = initialModel for (int i = 0; i < iterations; i++) { val current = sc.broadcast(nextModel) val newState = current.unpersist sc.broadcast(nextModel)
  • 35. #SAISDD6 Minding the JVM heap val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))
  • 36. #SAISDD6 Minding the JVM heap val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0)) class pointer flags size locks element pointer element pointer class pointer flags size locks 1.0 class pointer flags size locks 3.0 4.0 2.0
  • 37. #SAISDD6 Minding the JVM heap val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0)) class pointer flags size locks element pointer element pointer class pointer flags size locks 1.0 class pointer flags size locks 3.0 4.0 2.0 32 bytes of data…
  • 38. #SAISDD6 Minding the JVM heap val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0)) class pointer flags size locks element pointer element pointer class pointer flags size locks 1.0 class pointer flags size locks 3.0 4.0 2.0 …and 64 bytes of overhead! 32 bytes of data…
  • 39. Continuous integration for Spark libraries and apps
  • 48. Writing generic code for Spark’s parallel collections
  • 49. #SAISDD6 The RDD is invariant T <: U RDD[T] <: RDD[U]
  • 50. #SAISDD6 The RDD is invariant T <: U RDD[T] <: RDD[U] dog animal
  • 51. #SAISDD6 T <: U RDD[T] <: RDD[U] trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x)) trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
  • 52. #SAISDD6 T <: U RDD[T] <: RDD[U] trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x)) trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x)) trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x)) trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
  • 53. #SAISDD6 val xacts = spark.parallelize(Array( Transaction(1, 1, 1.0), Transaction(2, 2, 1.0) )) badKeyByUserId(xacts) <console>: error: type mismatch; found : org.apache.spark.rdd.RDD[Transaction] required: org.apache.spark.rdd.RDD[HasUserId] Note: Transaction <: HasUserID, but class RDD is invariant in type T. You may wish to define T as +T instead. (SLS 4.5) badKeyByUserId(xacts)
  • 54. #SAISDD6 val xacts = spark.parallelize(Array( Transaction(1, 1, 1.0), Transaction(2, 2, 1.0) )) badKeyByUserId(xacts) <console>: error: type mismatch; found : org.apache.spark.rdd.RDD[Transaction] required: org.apache.spark.rdd.RDD[HasUserId] Note: Transaction <: HasUserID, but class RDD is invariant in type T. You may wish to define T as +T instead. (SLS 4.5) badKeyByUserId(xacts)
  • 56. #SAISDD6 An example: natural join A B C D E A EB X Y
  • 57. #SAISDD6 An example: natural join A B C D E A EB X Y
  • 58. #SAISDD6 An example: natural join A B C D E X Y
  • 59. #SAISDD6 Ad-hoc natural join df1.join(df2, df1("a") === df2("a") && df1("b") === df2("b") && df1("e") === df2("e"))
  • 60. #SAISDD6 = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame
  • 61. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) }
  • 62. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } introspecting over column names
  • 63. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) }
  • 64. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing expressions
  • 65. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing expressions
  • 66. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing expressions
  • 67. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } [left.a === right.a, left.b === right.b, …]
  • 68. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } left.a === right.a && left.b === right.b && …
  • 69. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } left.a === right.a && left.b === right.b && …
  • 70. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) }
  • 71. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing column lists
  • 72. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing column lists
  • 73. #SAISDD6 case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf) case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf)
  • 74. #SAISDD6 case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf) case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf)
  • 75. #SAISDD6 case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf)
  • 76. #SAISDD6 User-defined functions {"a": 1, "b": "wilma", ..., "x": "club"} {"a": 2, "b": "betty", ..., "x": "diamond"} {"a": 3, "b": "fred", ..., "x": "heart"} {"a": 4, "b": "barney", ..., "x": "spade"}
  • 77. #SAISDD6 User-defined functions {"a": 1, "b": "wilma", ..., "x": "club"} {"a": 2, "b": "betty", ..., "x": "diamond"} {"a": 3, "b": "fred", ..., "x": "heart"} {"a": 4, "b": "barney", ..., "x": "spade"} wilma club betty diamond fred heart barney spade
  • 78. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 79. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 80. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 81. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 82. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 83. #SAISDD6
    extract_bx = selectively_structure(["b", "x"])
    structured_df = df.withColumn("result", extract_bx("json"))
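A quick follow-up sketch (not from the slides): because the UDF returns a struct column, the extracted fields can be selected with dotted names. This assumes df has a string column named "json", as above.

    # Hypothetical usage of the struct column produced by extract_bx above
    structured_df.select("result.b", "result.x").show()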
  • 88. #SAISDD6 Working with ML pipelines model.transform(df)
  • 93. #SAISDD6 Spark’s ML pipelines: an estimator is configured with params (inputCol, epochs, seed, outputCol); estimator.fit(df) trains and returns a model, and model.transform(df) applies that model to a data frame.
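To make the pattern concrete, here is a small sketch using a stock Spark ML estimator with analogous params; Word2Vec, the column names, and df are stand-ins for illustration, not part of the talk.

    // Illustration only: a stock estimator with params analogous to those pictured
    import org.apache.spark.ml.feature.Word2Vec

    val estimator = new Word2Vec()
      .setInputCol("tokens")     // inputCol
      .setOutputCol("features")  // outputCol
      .setMaxIter(5)             // plays the role of "epochs"
      .setSeed(42L)              // seed

    val model = estimator.fit(df)       // Estimator.fit: DataFrame => Model
    val scored = model.transform(df)    // Model.transform: DataFrame => DataFrame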
  • 95. #SAISDD6 Forecast Basic considerations for reusable Spark code Generic functions for parallel collections Extending data frames with custom aggregates Exposing JVM libraries to Python Sharing your work with the world
  • 104. #SAISDD6
    case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
      (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
      extends UserDefinedAggregateFunction {

      def deterministic: Boolean = false

      def inputSchema: StructType =
        StructType(StructField("x", dataTpe.tpe) :: Nil)

      def bufferSchema: StructType =
        StructType(StructField("tdigest", TDigestUDT) :: Nil)

      def dataType: DataType = TDigestUDT
  • 112. #SAISDD6 Four main functions: initialize
  • 114. #SAISDD6
    def initialize(buf: MutableAggregationBuffer): Unit = {
      buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
    }
  • 116. #SAISDD6 Four main functions: evaluate
  • 118. #SAISDD6
    def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
  • 122. #SAISDD6
    def update(buf: MutableAggregationBuffer, input: Row): Unit = {
      if (!input.isNullAt(0)) {
        buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0)))
      }
    }
  • 129. #SAISDD6 Four main functions: merge (combining two partial aggregation buffers, 1 + 2, into one)
  • 131. #SAISDD6
    def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
      buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest)
    }
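With the schema members and these four functions in place, the UDAF can be applied like any built-in aggregate. A minimal usage sketch (assuming the Double instances of the implicits are in scope and that df has a double column "x"):

    // Usage sketch only; df and its column "x" are assumptions
    import org.apache.spark.sql.functions.col

    val udaf = TDigestUDAF[Double](deltaV = 0.5, maxDiscreteV = 0)
    val sketches = df.agg(udaf(col("x")).alias("tdigest"))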
  • 134. #SAISDD6 User-defined types
    package org.apache.spark.isarnproject.sketches.udt

    @SQLUserDefinedType(udt = classOf[TDigestUDT])
    case class TDigestSQL(tdigest: TDigest)

    class TDigestUDT extends UserDefinedType[TDigestSQL] {
      def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
      // ....
  • 137. #SAISDD6 Implementing custom types
    class TDigestUDT extends UserDefinedType[TDigestSQL] {
      def userClass: Class[TDigestSQL] = classOf[TDigestSQL]

      override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"

      override def typeName: String = "tdigest"

      def sqlType: DataType = StructType(
        StructField("delta", DoubleType, false) ::
        /* ... */
        StructField("clustM", ArrayType(DoubleType, false), false) ::
        Nil)
  • 141. #SAISDD6
    def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)

    private[sketches] def serializeTD(td: TDigest): InternalRow = {
      val TDigest(delta, maxDiscrete, nclusters, clusters) = td
      val row = new GenericInternalRow(5)
      row.setDouble(0, delta)
      row.setInt(1, maxDiscrete)
      row.setInt(2, nclusters)
      val clustX = clusters.keys.toArray
      val clustM = clusters.values.toArray
      row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
      row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
      row
    }
  • 145. #SAISDD6
    def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))

    private[sketches] def deserializeTD(datum: Any): TDigest = datum match {
      case row: InternalRow =>
        val delta = row.getDouble(0)
        val maxDiscrete = row.getInt(1)
        val nclusters = row.getInt(2)
        val clustX = row.getArray(3).toDoubleArray()
        val clustM = row.getArray(4).toDoubleArray()
        val clusters = clustX.zip(clustM)
          .foldLeft(TDigestMap.empty) { case (td, e) => td + e }
        TDigest(delta, maxDiscrete, nclusters, clusters)
    }
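Once serialize and deserialize are defined, t-digest values survive shuffles and come back to the driver as ordinary objects. A sketch, reusing the hypothetical sketches data frame from the UDAF usage sketch above:

    // Sketch: a collected UDT column deserializes back to the user class (TDigestSQL)
    val td = sketches.first().getAs[TDigestSQL]("tdigest").tdigest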
  • 148. Extending PySpark with your Scala library
  • 153. #SAISDD6
    # class to access the active Spark context for Python
    from pyspark.context import SparkContext

    # gateway to the JVM from py4j
    sparkJVM = SparkContext._active_spark_context._jvm

    # use the gateway to access JVM objects and classes
    thisThing = sparkJVM.com.path.to.this.thing
  • 156. #SAISDD6 A Python-friendly wrapper
    package org.isarnproject.sketches.udaf

    object pythonBindings {
      def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) =
        TDigestUDAF[Double](delta, maxDiscrete)
    }
  • 159. #SAISDD6
    from pyspark.sql.column import Column, _to_java_column, _to_seq
    from pyspark.context import SparkContext

    # one of these for each type parameter Double, Int, Long, etc
    def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0):
        sc = SparkContext._active_spark_context
        pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings
        tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply
        return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
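Calling the wrapper from PySpark then looks like any other aggregate function. A usage sketch; df and its double-typed column "x" are assumptions:

    # Usage sketch only; df and its column "x" are assumptions
    sketches = df.agg(tdigestDoubleUDAF("x", delta=0.5, maxDiscrete=0).alias("tdigest"))
    sketches.show()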
  • 162. #SAISDD6
    class TDigestUDT(UserDefinedType):
        @classmethod
        def sqlType(cls):
            return StructType([
                StructField("delta", DoubleType(), False),
                StructField("maxDiscrete", IntegerType(), False),
                StructField("nclusters", IntegerType(), False),
                StructField("clustX", ArrayType(DoubleType(), False), False),
                StructField("clustM", ArrayType(DoubleType(), False), False)])
        # ...
  • 164. #SAISDD6
    class TDigestUDT(UserDefinedType):
        # ...
        @classmethod
        def module(cls):
            return "isarnproject.sketches.udt.tdigest"

        @classmethod
        def scalaUDT(cls):
            return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"

        def simpleString(self):
            return "tdigest"
  • 167. #SAISDD6
    class TDigestUDT(UserDefinedType):
        # ...
        def serialize(self, obj):
            return (obj.delta, obj.maxDiscrete, obj.nclusters,
                    [float(v) for v in obj.clustX],
                    [float(v) for v in obj.clustM])

        def deserialize(self, datum):
            return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
  • 169. #SAISDD6
    class TDigestUDT extends UserDefinedType[TDigestSQL] {
      // ...
      override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
    }
  • 170. #SAISDD6 Python code in JAR files
    mappings in (Compile, packageBin) ++= Seq(
      (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
        "isarnproject/__init__.pyc",
      (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
        "isarnproject/sketches/__init__.pyc",
      (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
        "isarnproject/sketches/udaf/__init__.pyc",
      (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
        "isarnproject/sketches/udaf/tdigest.pyc",
      (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
        "isarnproject/sketches/udt/__init__.pyc",
      (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
        "isarnproject/sketches/udt/tdigest.pyc"
    )
  • 174. #SAISDD6 Cross-building for Python
    lazy val compilePython = taskKey[Unit]("Compile python files")

    compilePython := {
      val s: TaskStreams = streams.value
      s.log.info("compiling python...")
      val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
      if (stat != 0) { throw new IllegalStateException("python compile failed") }
    }

    (packageBin in Compile) <<= (packageBin in Compile).dependsOn(compilePython)
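pythonCMD is referenced but not defined on these slides; a minimal sketch of how such a setting might be declared (an assumption, not the project's actual build code):

    // Hypothetical setting for the Python interpreter used by compilePython above
    lazy val pythonCMD = settingKey[String]("python interpreter used to byte-compile sources")
    pythonCMD := sys.env.getOrElse("PYTHON_CMD", "python")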
  • 177. #SAISDD6 Using versioned JAR files
    $ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7'
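With the package on the session's classpath, the bundled Python modules can be imported directly. A sketch; the module path follows the mappings shown earlier, and df with a double column "x" is assumed:

    # Sketch: import the bundled bindings and aggregate a column
    from isarnproject.sketches.udaf.tdigest import tdigestDoubleUDAF
    sketches = df.agg(tdigestDoubleUDAF("x"))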
  • 180. Show your work: publishing results
  • 181. #SAISDD6 Developing with git-flow
    $ brew install git-flow     # macOS
    $ dnf install git-flow      # Fedora
    $ yum install git-flow      # CentOS
    $ apt-get install git-flow  # Debian and friends
    (Search the internet for “git flow” to learn more!)
  • 182. #SAISDD6
    # Set up git-flow in this repository
    $ git flow init

    # Start work on my-awesome-feature; create
    # and switch to a feature branch
    $ git flow feature start my-awesome-feature
    $ ...

    # Finish work on my-awesome-feature; merge
    # feature/my-awesome-feature to develop
    $ git flow feature finish my-awesome-feature
  • 183. #SAISDD6
    # Start work on a release branch
    $ git flow release start 0.1.0

    # Hack and bump version numbers
    $ ...

    # Finish work on v0.1.0; merge
    # release/0.1.0 to develop and master;
    # tag v0.1.0
    $ git flow release finish 0.1.0
  • 188. #SAISDD6
                                              Maven Central      Bintray
    easy to set up for library developers     not really         trivial
    easy to set up for library users          trivial            mostly
    easy to publish                           yes, via sbt       yes, via sbt + plugins
    easy to resolve artifacts                 yes                mostly
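For reference, a generic sbt publishing sketch (placeholders only; not the talk's configuration, and Sonatype and Bintray each have dedicated sbt plugins that handle most of this):

    // Generic publishing settings; the URLs and credentials path are placeholders
    publishMavenStyle := true
    publishTo := Some(
      if (isSnapshot.value)
        "snapshots" at "https://oss.sonatype.org/content/repositories/snapshots"
      else
        "releases" at "https://oss.sonatype.org/service/local/staging/deploy/maven2"
    )
    credentials += Credentials(Path.userHome / ".sbt" / "sonatype_credentials")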