Introduction to Scalding and Monoids

Hugo Gävert
@hgavert
  
	
  
Map Reduce

•  Programming model for processing large data sets with a parallel, distributed algorithm on a cluster
•  Inspired by the map and reduce functions commonly found in functional programming languages (a small sketch follows)
•  map() performs transformations and filtering on given values
•  reduce() performs a summary operation on given values
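As a reminder of those functional-programming roots, here is a minimal plain-Scala sketch (ordinary collections, no Hadoop involved):

val lines = List("a rose is a rose", "it is")
val words = lines.flatMap(_.split("\\s+"))   // map side: transform each line into words
val kept  = words.filter(_ != "a")           // the map side can also filter
val total = kept.map(_ => 1).reduce(_ + _)   // reduce side: summarize to a single count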
  
How does it work?

[Diagram: found this from the Internet, forgot from where]
  
The scene

•  Hadoop – open source implementation of Google’s MapReduce and Google File System papers
•  Java…
•  Higher level frameworks/platforms
  –  Hive ≈ SQL
  –  Pig (procedural ≈ “more programming than SQL”)
  –  Cascading – Java MR application framework for enterprise data flows
    •  If you must do Java, do this!
  –  Scalding – Scala DSL for Cascading, easy to pick up yet very powerful
  –  Cascalog – Clojure DSL for Cascading, declarative, logic programming
  
The scene (*)

* Borrowed from an excellent presentation by Vitaly Gordon and Christopher Severs
  	
  
“Hadoop is a distributed system for counting words”

package org.myorg;

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class WordCount {

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}
  
	
  
What do we actually want to do?

Documents (lines) → Tokenize → GroupBy (token) → Count → Word count
  
Word Count in Scalding

package com.sanoma.cda.examples
import com.twitter.scalding._

class WordCount1(args: Args) extends Job(args) {
  TextLine(args("input"))
    .flatMap('line -> 'word) { line: String => line.split("\\s+") }
    .groupBy('word) { _.size }
    .write(Tsv(args("output")))
}

There is scald.rb to get you started (get it from the GitHub project).

Building and running a fat jar (for local runs include hadoop; for the cluster mark it “provided” – a build.sbt sketch follows the commands):

> sbt assembly
> java -jar target/scala-2.10/scalding_talk-assembly-0.1.jar com.sanoma.cda.examples.WordCount1 --local --input data/11.txt.utf-8 --output wc.txt
> hadoop jar job-jars/scalding_talk-assembly-0.1.jar -Dmapred.reduce.tasks=70 com.sanoma.cda.examples.WordCount1 --hdfs --input /data/AliceInWonderland --output /user/Alice_wc
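The “provided” remark lives in the sbt build definition; a minimal build.sbt sketch (the version numbers here are illustrative assumptions, not from the slides):

// build.sbt sketch; versions are assumptions for illustration
libraryDependencies ++= Seq(
  "com.twitter" %% "scalding-core" % "0.9.1",
  // keep hadoop on the classpath for local runs, but mark it
  // "provided" when assembling the fat jar for the cluster:
  "org.apache.hadoop" % "hadoop-core" % "1.2.1" % "provided"
)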
	
  

Sample output for Alice in Wonderland (word, count):

the     1664
and     1172
to       780
a        773
of       662
she      596
said     484
in       416
it       401
was      356
you      329
I        301
as       260
that     246
Alice    226
…        221

Splitting on whitespace alone keeps punctuation attached, so variants of “Alice” are counted separately:

Alice,    76
Alice.    54
Alice;    16
Alice's   11
Alice:     7
(Alice     4
Alice!     3
Alice,)    2
  
Word Count in Scalding

package com.sanoma.cda.examples
import com.twitter.scalding._

class WordCount2(args: Args) extends Job(args) {
  TextLine(args("input"))
    .flatMap('line -> 'word) { line: String => tokenize(line) }
    .filter('word) { word: String => word != "" }
    .groupBy('word) { _.size }

    .groupAll{ _.sortBy(('size, 'word)).reverse } // this is just for easy results
    .write(Tsv(args("output")))

  def tokenize(text: String): Array[String] = {
    text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+")
  }
}

Sample output (word, count):

the    1804
and     912
to      801
a       684
of      625
it      541
she     538
said    462
you     429
in      428
i       400
alice   385
was     358
that    291
as      272
her     248
with    228
at      224
on      204
all     197
  
Word count in Scalding

Almost a 1-to-1 relation between the process and the Scalding code!

UDFs are written directly in Scala, and Java libraries can be used.

Documents (lines) → Tokenize → GroupBy (token) → Count → Word count

package com.sanoma.cda.examples
import com.twitter.scalding._

class WordCount2(args: Args) extends Job(args) {
  TextLine(args("input"))
    .flatMap('line -> 'word) { tokenize }
    .groupBy('word) { _.size }
    .write(Tsv(args("output")))

  def tokenize(text: String): Array[String] = {
    text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+")
  }
}
  
About Scalding

•  Started at Twitter – years of production use
•  Well tested and optimized by different teams, including Twitter, Concurrent Inc., Etsy, …
•  Has a very fast local mode (no need to install Hadoop locally)
•  The flow planner is designed to be portable → in the future, the same jobs might run on a Storm cluster, for example
•  Scala… a very nice programming language – YMMV
  –  Functional & object oriented, has a REPL
  
Scalding Functions

•  3 APIs:
  –  Fields-based API – easy to start from here
  –  Type-safe API
  –  Matrix API
•  Fields-based API (a usage sketch follows this list)
  –  Map-like functions
    •  map, flatMap, project, insert, filter, limit…
  –  Grouping/reducing functions
    •  groupBy, groupAll
    •  .size, .sum, .average, .sizeAveStdev, .toList, .max, .sortBy, .reduce, .foldLeft, .pivot, …
  –  Join operations
    •  joinWithSmaller, joinWithLarger, joinWithTiny, crossWithTiny
    •  InnerJoin, LeftJoin, RightJoin, OuterJoin
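A minimal sketch of how the fields-based pieces compose; the inputs and field names here are hypothetical, made up for illustration:

// Inside a Job(args), as in the word-count examples.
// Assumed inputs: users (userId, name) and purchases (buyerId, amount).
val users     = Tsv(args("users"), ('userId, 'name)).read
val purchases = Tsv(args("purchases"), ('buyerId, 'amount)).read

purchases
  .joinWithSmaller('buyerId -> 'userId, users) // users assumed the smaller pipe
  .groupBy('name) { _.sum[Double]('amount) }   // total amount per name
  .write(Tsv(args("output")))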
  
Scalding matrix API

package com.twitter.scalding.examples
import com.twitter.scalding._
import com.twitter.scalding.mathematics.Matrix

/**
 * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j]
 * and computes the cosine of the angle between every pair of vectors
 */
class ComputeCosineJob(args: Args) extends Job(args) {
  import Matrix._

  val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) )
    .read
    .toMatrix[Long,Long,Double]('user1, 'user2, 'rel)

  // we compute the L2 normalized adjacency graph
  val matL2Norm = adjacencyMatrix.rowL2Normalize

  // we compute the inner product of the normalized matrix with itself,
  // which is equivalent to computing the cosine: AA^T / (||A|| * ||A||)
  val cosDist = matL2Norm * matL2Norm.transpose

  cosDist.write(Tsv(args("output")))
}
  
	
  
Introduction to Scalding and Monoids
What is a monoid?

•  Closure
   ∀a, b ∈ T : a • b ∈ T
•  Associativity
   ∀a, b, c ∈ T : (a • b) • c = a • (b • c)
•  Identity element
   ∃I ∈ T : ∀a ∈ T : I • a = a • I = a

Scala trait:

trait Monoid[T] {
  def zero: T
  def plus(left: T, right: T): T
}
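As a concrete instance of the trait, a minimal Min monoid over Int (my own sketch; Algebird ships ready-made instances like this):

object MinIntMonoid extends Monoid[Int] {
  def zero: Int = Int.MaxValue                          // identity: plus(zero, a) == a
  def plus(left: Int, right: Int): Int = left min right // closed over Int and associative
}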
  
Examples of monoids

•  Numbers, strings, lists, sets, maps (the map monoid is sketched after this list)
•  Algorithms:
  –  Min, Max
  –  Moments (count, mean, std, …)
  –  Approximate histograms, quantiles
  –  Approximate data structures
    •  Bloom filter, CountMinSketch, HyperLogLog
  –  Stochastic gradient descent
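For example, the map monoid merges maps by summing the values of matching keys; with Algebird's Operators (used in WordCount3 below) this is plain +:

import com.twitter.algebird.Operators._

Map("a" -> 1, "b" -> 2) + Map("a" -> 10, "c" -> 3)
// => Map("a" -> 11, "b" -> 2, "c" -> 3)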
  
What’s the point?

a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7

(a0 + a1) + (a2 + a3) + (a4 + a5) + (a6 + a7)

( (a0 + a1) + (a2 + a3) ) + ( (a4 + a5) + (a6 + a7) )

( ( (a0 + a1) + (a2 + a3) ) + ( (a4 + a5) + (a6 + a7) ) )

→ Parallelism
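A small sketch of why associativity buys parallelism: chunks can be reduced independently (e.g. one per mapper) and the partial results combined afterwards:

val as = (0 to 7).toList                          // a0 ... a7
val plus = (x: Int, y: Int) => x + y              // any associative plus works
val partials = as.grouped(2).map(_.reduce(plus))  // each chunk reduced on its own
val total = partials.reduce(plus)                 // equals as.reduce(plus)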
  
What’s the point?

a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7

(a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7) + a8

→ Incremental aggregation
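The same property gives incremental aggregation: combine the stored aggregate with the new values instead of re-scanning the history, as in this small sketch:

val history = (0 to 7).sum // previously computed aggregate of a0 ... a7
val updated = history + 8  // fold in a8 alone; the old inputs are never touched again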
  
What’s the point?

•  Easily unit testable operations
•  Simple aggregation code

→ Better quality
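The unit testability is concrete: the monoid laws themselves make natural test cases. A sketch against the trait shown earlier:

def isAssociative[T](m: Monoid[T])(a: T, b: T, c: T): Boolean =
  m.plus(m.plus(a, b), c) == m.plus(a, m.plus(b, c))

def hasIdentity[T](m: Monoid[T])(a: T): Boolean =
  m.plus(m.zero, a) == a && m.plus(a, m.zero) == a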
  
Word Count with Map Monoid

package com.sanoma.cda.examples
import com.twitter.scalding._
import com.twitter.algebird.Operators._

class WordCount3(args: Args) extends Job(args) {
  TextLine(args("input"))
    .flatMap('line -> 'word) { tokenize }
    .map('word -> 'word) { w: String => Map[String, Int](w -> 1) }
    .groupAll{ _.sum[Map[String, Int]]('word) }

    // We could save the map here, but if we want similar output as in the previous examples...
    .flatMap('word -> ('word, 'size)) { words: Map[String, Int] => words.toList }
    .groupAll{ _.sortBy(('size, 'word)).reverse } // this is just for easy results
    .write(Tsv(args("output")))

  def tokenize(text: String): Array[String] = {
    text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+").filter( _ != "" )
  }
}

Sample output (word, count), identical to WordCount2:

the    1804
and     912
to      801
a       684
of      625
it      541
she     538
said    462
you     429
in      428
i       400
alice   385
was     358
that    291
as      272
her     248
with    228
at      224
on      204
all     197
  
Top Words with CMS

package com.sanoma.cda.examples
import com.twitter.scalding._
import com.twitter.algebird._

class WordCount5(args: Args) extends Job(args) {
  implicit def utf8(s: String): Array[Byte] = com.twitter.bijection.Injection.utf8(s)
  implicit val cmsm = new SketchMapMonoid[String, Long](128, 6, 0, 20) // top 20
  type ApproxMap = SketchMap[String, Long]

  TextLine(args("input"))
    .flatMap('line -> 'word) { tokenize }
    .map('word -> 'word) { w: String => cmsm.create((w, 1L)) }
    .groupAll{ _.sum[ApproxMap]('word) }
    .flatMap('word -> ('word, 'size)) { words: ApproxMap => words.heavyHitters }
    .write(Tsv(args("output")))

  def tokenize(text: String): Array[String] = {
    text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+").filter( _ != "" )
  }
}

Approximate output (word, count); the sketch slightly overestimates the exact counts above:

the    1859
and     972
to      867
a       748
of      711
she     636
it      619
said    579
you     504
in      495
i       456
alice   431
at      407
was     394
that    342
her     341
with    338
as      337
not     290
be      286
  
Start using Scalding now! :-)

GitHub:
https://github.com/twitter/scalding

Tutorials:
https://github.com/twitter/scalding/tree/develop/tutorial
  
	
  
	
  
Thanks!

Hugo Gävert
Sanoma
  
