Scalding
Hadoop Word Count
in < 70 lines of code

Konrad 'ktoso' Malawski
JARCamp #3 12.04.2013


Scalding
Hadoop Word Count
in 4 lines of code

Konrad 'ktoso' Malawski
JARCamp #3 12.04.2013
softwaremill.com / java.pl / sckrk.com / geecon.org / krakowscala.pl / gdgkrakow.pl
Agenda
 Why Scalding? (10%)
           +
 Hadoop Basics (20%)
           +
Enter Cascading (40%)
           +
 Hello Scalding (30%)
           =
         100%
Why Scalding?
 Word Count in Types


type Word = String
type Count = Int

String => Map[Word, Count]
Why Scalding?
                Word Count in Scala

val text = "a a a b b"

def wordCount(text: String): Map[Word, Count] =
  text
    .split(" ")
    .map(a => (a, 1))
    .groupBy(_._1)
    .map { a => a._1 -> a._2.map(_._2).sum }



wordCount(text) should equal (Map("a" -> 3, "b" -> 2))
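
The same thing as a self-contained snippet that compiles and runs on its own (a plain assert stands in for ScalaTest's should equal):

object WordCountInScala {
  type Word  = String
  type Count = Int

  def wordCount(text: String): Map[Word, Count] =
    text
      .split(" ")
      .map(a => (a, 1))
      .groupBy(_._1)
      .map { a => a._1 -> a._2.map(_._2).sum }

  def main(args: Array[String]): Unit =
    // plain assert instead of ScalaTest's `should equal`
    assert(wordCount("a a a b b") == Map("a" -> 3, "b" -> 2))
}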
Stuff > Memory
Scala collections... fun, but memory bound!


val text = "so many words... waaah! ..."      // in Memory

  text
    .split(" ")                               // in Memory
    .map(a => (a, 1))                         // in Memory
    .groupBy(_._1)                            // in Memory
    .map(a => (a._1, a._2.map(_._2).sum))     // in Memory
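
To spell out what those annotations mean, here is the same pipeline with every intermediate collection given a name and a type; each one is fully materialised on the heap before the next step starts (names are purely illustrative):

val text = "so many words... waaah! ..."

val tokens: Array[String]                     = text.split(" ")            // all tokens in memory
val pairs:  Array[(String, Int)]              = tokens.map(a => (a, 1))    // another full array
val groups: Map[String, Array[(String, Int)]] = pairs.groupBy(_._1)        // every group held at once
val counts: Map[String, Int]                  = groups.map(a => (a._1, a._2.map(_._2).sum))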
Apache Hadoop (HDFS + MR)
    http://hadoop.apache.org/
Why Scalding?
Word Count in Hadoop MR

package org.myorg;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

public class WordCount {

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}
Trivia: How old is Hadoop?
Scalding - Hadoop Word Count in LESS than 70 lines of code
Cascading
www.cascading.org/


Cascading
     is
Taps & Pipes
        & Sinks
1: Distributed Copy

// source Tap
Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);

// sink Tap
Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);

// a Pipe, connects taps
Pipe copyPipe = new Pipe("copy");

// build the Flow
FlowDef flowDef = FlowDef.flowDef()
  .addSource(copyPipe, inTap)
  .addTailSink(copyPipe, outTap);

// run!
flowConnector.connect(flowDef).complete();
1. DCP - Full Code

public class Main {
  public static void main(String[] args) {
    String inPath  = args[0];
    String outPath = args[1];

    Properties props = new Properties();
    AppProps.setApplicationJarClass(props, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(props);

    Tap inTap  = new Hfs(new TextDelimited(true, "\t"), inPath);

    Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);

    Pipe copyPipe = new Pipe("copy");

    FlowDef flowDef = FlowDef.flowDef()
      .addSource(copyPipe, inTap)
      .addTailSink(copyPipe, outTap);

    flowConnector.connect(flowDef).complete();
  }
}
2: Word Count

public class Main {
  public static void main(String[] args) {
    String docPath = args[0];
    String wcPath  = args[1];

    Properties props = new Properties();
    AppProps.setApplicationJarClass(props, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(props);

    // create source and sink taps
    Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
    Tap wcTap  = new Hfs(new TextDelimited(true, "\t"), wcPath);

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields("token");
    Fields text  = new Fields("text");
    RegexSplitGenerator splitter =
        new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");

    // only returns "token"
    Pipe docPipe = new Each("token", text, splitter, Fields.RESULTS);

    // determine the word counts
    Pipe wcPipe = new Pipe("wc", docPipe);
    wcPipe = new GroupBy(wcPipe, token);
    wcPipe = new Every(wcPipe, Fields.ALL, new Count(), Fields.ALL);

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
      .setName("wc")
      .addSource(docPipe, docTap)
      .addTailSink(wcPipe, wcTap);

    // write a DOT file and run the flow
    Flow wcFlow = flowConnector.connect(flowDef);
    wcFlow.writeDOT("dot/wc.dot");
    wcFlow.complete();
  }
}

2: Word Count
How it's made

Graph representation of jobs!

http://www.cascading.org/2012/07/09/cascading-for-the-impatient-part-2/
How it's made
val flow = FlowDef

// pseudo code...
val jobs: List[MRJob] = flowConnector(flow)

// pseudo code...
HadoopCluster.execute(jobs)
Cascading tips
Pipe assembly = new Pipe( "assembly" );
assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() );
// ...

// head and tail have same name
FlowDef flowDef = new FlowDef()
  .setName( "debug" )
  .addSource( "assembly", source )
  .addSink( "assembly", sink )
  .addTail( assembly );


flowDef.setDebugLevel( DebugLevel.NONE );

                     flowConnector will NOT create the Debug pipe!
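
Scalding has a one-line counterpart to the Debug pipe: the debug operation listed later on the Scalding API slide. A minimal sketch (field names and arguments are illustrative, not from the talk):

import com.twitter.scalding._

class DebugTapJob(args: Args) extends Job(args) {
  Tsv(args("input"), ('name, 'age))
    .read
    .debug                       // prints each tuple flowing through this point of the pipe
    .write(Tsv(args("output")))
}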
Scalding
     =
     +


   Twitter Scalding
github.com/twitter/scalding
Scalding API
map

Scala:
  val data = 1 :: 2 :: 3 :: Nil

  val doubled = data map { _ * 2 }                   // Int => Int


Scalding:
  IterableSource(data)
    .map('number -> 'doubled) { n: Int => n * 2 }    // Int => Int
                       // 'number stays in the Pipe, 'doubled becomes available in the Pipe
                       // the n: Int annotation is required: you must choose the type!
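
As a complete (if tiny) job, the snippet above could look like this: a minimal sketch, with the class name and the output argument made up for illustration:

import com.twitter.scalding._

class DoublingJob(args: Args) extends Job(args) {
  val data = List(1, 2, 3)

  IterableSource(data, 'number)                      // one field: 'number
    .map('number -> 'doubled) { n: Int => n * 2 }    // adds 'doubled next to 'number
    .write(Tsv(args("output")))                      // writes both fields
}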
mapTo

Scala:
  var data = 1 :: 2 :: 3 :: Nil

  val doubled = data map { _ * 2 }        // Int => Int
  data = null                             // release the reference


Scalding:
  IterableSource(data)
    .mapTo('doubled) { n: Int => n * 2 }             // Int => Int
                       // 'number is removed, only 'doubled stays in the Pipe
flatMap
 Scala:
 val data = "1" :: "2,2" :: "3,3,3" :: Nil   // List[String]

 val numbers = data flatMap { line =>   // String
   line.split(",")                      // Array[String]
 } map { _.toInt }                      // List[Int]

 numbers             // List[Int]
 numbers should equal (List(1, 2, 2, 3, 3, 3))



Scalding:
  TextLine(data)                                 // like List[String]
    .flatMap('line -> 'word) { _.split(",") }    // like List[String]
    .map('word -> 'number) { _.toInt }           // like List[Int]

                     (here the .toInt map runs outside the flatMap, as a separate Pipe operation on the MapReduce side)
flatMap
 Scala:
 val data = "1" :: "2,2" :: "3,3,3" :: Nil   // List[String]

 val numbers = data flatMap { line =>   // String
   line.split(",").map(_.toInt)         // Array[Int]
 }

 numbers             // List[Int]
 numbers should equal (List(1, 2, 2, 3, 3, 3))



Scalding:
  TextLine(data)                               // like List[String]
    .flatMap('line -> 'word) { _.split(",").map(_.toInt) }
                                                  // like List[Int]
                          (here the .toInt map runs inside the Scala function, within the single flatMap)
groupBy
 Scala:
 val data = 1 :: 2 :: 30 :: 42 :: Nil         // List[Int]

 val groups = data groupBy { _ < 10 }

 groups                // Map[Boolean, List[Int]]

 groups(true) should equal (List(1, 2))
 groups(false) should equal (List(30, 42))



Scalding:
 IterableSource(List(1, 2, 30, 42), 'num)
     .map('num -> 'lessThanTen) { i: Int => i < 10 }
     .groupBy('lessThanTen) { _.size('size) }

 groups all rows with an equal 'lessThanTen value; the group size ends up in 'size
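
In plain Scala collections the same size-per-group computation reads (just to show what 'size corresponds to):

val data  = List(1, 2, 30, 42)
val sizes = data.groupBy(_ < 10).map { case (lessThanTen, nums) => lessThanTen -> nums.size }
// sizes == Map(true -> 2, false -> 2)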
groupBy


Scalding:
 IterableSource(List(1, 2, 30, 42), 'num)
     .map('num -> 'lessThanTen) { i: Int => i < 10 }
     .groupBy('lessThanTen) { _.sum('total) }

                              'total = [3, 72]
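
Again the plain Scala collections analogue, which also shows where those totals come from:

val data   = List(1, 2, 30, 42)
val totals = data.groupBy(_ < 10).map { case (lessThanTen, nums) => lessThanTen -> nums.sum }
// totals == Map(true -> 3, false -> 72)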
Scalding API
             project / discard
               map / mapTo
            flatMap / flatMapTo
                 rename
                   filter
                  unique
groupBy / groupAll / groupRandom / shuffle
                   limit
                  debug

          Group operations

                 joins
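
A minimal sketch chaining a few of the operations above in one fields-based job (field names and arguments are illustrative, not from the talk):

import com.twitter.scalding._

class ApiTourJob(args: Args) extends Job(args) {
  Tsv(args("input"), ('name, 'age, 'city))
    .read
    .filter('age) { a: Int => a >= 18 }     // keep only some rows
    .rename('city -> 'town)                 // rename a field
    .project('name, 'town)                  // drop all other fields
    .unique('name, 'town)                   // distinct rows
    .write(Tsv(args("output")))
}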
Distributed Copy in Scalding

class WordCountJob(args: Args) extends Job(args) {

    val input = Tsv(args("input"))
    val output = Tsv(args("output"))

    input.read.write(output)

}




                      The End.
Main Class - "Runner"

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.util.ToolRunner
import com.twitter.scalding

object ScaldingJobRunner extends App {          // args comes from the App trait

    ToolRunner.run(new Configuration, new scalding.Tool, args)

}
Word Count in Scalding

class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    // the actual job: these 4 lines
    TextLine(inputFile)
      .flatMap('line -> 'word) { line: String => tokenize(line) }
      .groupBy('word) { _.size }
      .write(Tsv(outputFile))

    def tokenize(text: String): Array[String] = implemented
}
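
A local test for this job can be written with Scalding's JobTest; a minimal sketch, assuming tokenize splits on whitespace (file names and expected counts are illustrative):

import com.twitter.scalding._

object WordCountJobSpec {
  // runs the job in local mode, feeding a fake source and checking the sink
  def main(args: Array[String]): Unit =
    JobTest(new WordCountJob(_))
      .arg("input", "fakeInput.txt")
      .arg("output", "fakeOutput.tsv")
      .source(TextLine("fakeInput.txt"), List((0, "a a a b b")))
      .sink[(String, Int)](Tsv("fakeOutput.tsv")) { buffer =>
        val counts = buffer.toMap
        assert(counts("a") == 3 && counts("b") == 2)
      }
      .run
      .finish
}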
Word Count in Scalding

run pl.project13.scala.oculus.job.WordCountJob --tool.graph

=> pl.project13.scala.oculus.job.WordCountJob0.dot

(the generated .dot graph shows the planned job, split into its MAP and REDUCE phases)
Word Count in Scalding

TextLine(inputFile)
  .flatMap('line -> 'word) { line: String => tokenize(line) }
  .groupBy('word) { _.size('count) }
  .write(Tsv(outputFile))
Why Scalding?


    Hadoop inside
Cascading abstractions
  Scala conciseness
Ask Stuff!

      Dzięki!
      Thanks!
     ありがとう!


Konrad Malawski @ java.pl
t: ktosopl / g: ktoso
b: blog.project13.pl

  • 18. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text
  • 19. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ")
  • 20. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1))
  • 21. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1)
  • 22. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map { a => a._1 -> a._2.map(_._2).sum }
  • 23. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map { a => a._1 -> a._2.map(_._2).sum } wordCount(text) should equal (Map("a" -> 3, "b" -> 2))
  • 24. Stuff > Memory Scala collections... fun but, memory bound! val text = "so many words... waaah! ..." text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 25. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 26. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 27. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 28. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) in Memory .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 29. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) in Memory .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum)) in Memory
  • 30. Apache Hadoop (HDFS + MR) http://hadoop.apache.org/
  • 31. Why Scalding? Word Count in Hadoop MR package org.myorg; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*; import java.io.IOException; import java.util.Iterator; import java.util.StringTokenizer; public class WordCount { public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); output.collect(word, one);
  • 32. private final static IntWritable one = new IntWritable(1); Why Scalding? private Text word = new Text(); public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); Word Count in Hadoop MR output.collect(word, one); } } } public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { int sum = 0; while (values.hasNext()) { sum += values.next().get(); } output.collect(key, new IntWritable(sum)); } } public static void main(String[] args) throws Exception { JobConf conf = new JobConf(WordCount.class); conf.setJobName("wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); } }
  • 33. Trivia: How old is Hadoop?
  • 44. Cascading is
  • 45. Cascading is Taps & Pipes
  • 46. Cascading is Taps & Pipes & Sinks
  • 49. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath);
  • 50. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath);
  • 51. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy");
  • 52. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef()
  • 53. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource( copyPipe, inTap )
  • 54. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap);
  • 55. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); // run! flowConnector.connect(flowDef).complete();
  • 56. 1. DCP - Full Code public class Main { public static void main(String[] args) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(props, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 62. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath );
  • 63. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); // specify a regex operation to split the "document" text lines into a token stream
  • 64. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); // specify a regex operation to split the "document" text lines into a token stream Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" );
  • 65. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );
  • 66. String wcPath = args[ 1 ]; 2: Word Count Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" )
  • 67. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 74. Fields token = new Fields( "token" ); 2: Word Count Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } }
  • 75. Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); 2: Word Count How it's made // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } }
  • 76. Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); 2: Word Count How it's made // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } Graph representation of jobs! }
  • 77. 2: Word Count How it's made http://www.cascading.org/2012/07/09/cascading-for-the-impatient-part-2/
  • 79. How it's made val flow = FlowDef
  • 80. How it's made val flow = FlowDef // pseudo code...
  • 81. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow)
  • 82. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow) // pseudo code...
  • 83. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow) // pseudo code... HadoopCluster.execute(jobs)
  • 85. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly );
  • 86. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly ); flowDef.setDebugLevel( DebugLevel.NONE );
  • 87. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly ); flowDef.setDebugLevel( DebugLevel.NONE ); flowConnector will NOT create the Debug pipe!
  • 88. Scalding = Scala + Cascading Twitter Scalding github.com/twitter/scalding
  • 90. map
  • 91. map Scala: val data = 1 :: 2 :: 3 :: Nil
  • 92. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 }
  • 93. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int
  • 95. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data)
  • 96. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 }
  • 97. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } // Int => Int
  • 98. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } available in Pipe // Int => Int
  • 99. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } stays in Pipe available in Pipe // Int => Int
  • 100. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } must choose type! // Int => Int
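Putting the field-based map into context, a minimal complete job might look like the sketch below (the job name, the 'number field and the input/output arguments are illustrative, not from the deck):

  import com.twitter.scalding._

  class DoubleJob(args: Args) extends Job(args) {
    // 'number comes from the source; map adds 'doubled next to it,
    // so both fields stay available further down the Pipe
    Tsv(args("input"), 'number)
      .map('number -> 'doubled) { n: Int => n * 2 } // Int => Int, the type annotation is required
      .write(Tsv(args("output")))
  }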
  • 101. mapTo
  • 102. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil
  • 103. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 }
  • 104. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null
  • 105. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int
  • 106. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference
  • 108. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data)
  • 109. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data) .mapTo('doubled) { n: Int => n * 2 }
  • 110. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data) .mapTo('doubled) { n: Int => n * 2 } // Int => Int
  • 111. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data) .mapTo('doubled) { n: Int => n * 2 } doubled stays in Pipe // Int => Int
  • 112. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data) .mapTo('doubled) { n: Int => n * 2 } number is removed doubled stays in Pipe // Int => Int
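For contrast, the same doubling written with mapTo, sketched under the same assumptions as the map example above, keeps only the newly produced field:

  // inside a Job body, as in the previous sketch
  Tsv(args("input"), 'number)
    .mapTo('number -> 'doubled) { n: Int => n * 2 } // 'number is dropped, only 'doubled remains
    .write(Tsv(args("output")))                     // the output has a single column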
  • 114. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String]
  • 115. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String
  • 116. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String]
  • 117. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int]
  • 118. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int]
  • 119. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 121. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String]
  • 122. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",") } // like List[String]
  • 123. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",") } // like List[String] .map('word -> 'number) { _.toInt } // like List[Int]
  • 124. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",") } // like List[String] .map('word -> 'number) { _.toInt } // like List[Int] MR map outside
  • 126. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String]
  • 127. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String
  • 128. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int]
  • 129. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] }
  • 130. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int]
  • 131. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 133. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String]
  • 134. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",").map(_.toInt) }
  • 135. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",").map(_.toInt) } // like List[Int]
  • 136. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",").map(_.toInt) } // like List[Int] map inside Scala
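The two flatMap variants are equivalent formulations of the same pipeline; a rough sketch of both shapes (paths and field names are illustrative, and only one variant would appear in a real job):

  // variant 1: split in flatMap, convert to Int in a separate map step (two Pipe operations)
  TextLine(args("input"))
    .flatMap('line -> 'word) { line: String => line.split(",") }
    .map('word -> 'number) { s: String => s.toInt }
    .project('number)
    .write(Tsv(args("output")))

  // variant 2: do the conversion inside the Scala function passed to flatMap
  TextLine(args("input"))
    .flatMap('line -> 'number) { line: String => line.split(",").map(_.toInt) }
    .project('number)
    .write(Tsv(args("output")))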
  • 138. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int]
  • 139. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 }
  • 140. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, Int]
  • 141. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, Int] groups(true) should equal (List(1, 2))
  • 142. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, Int] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42))
  • 144. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, Int] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num)
  • 145. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, Int] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 }
  • 146. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, Int] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) }
  • 147. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, Int] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) } groups all with == value
  • 148. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, Int] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) } groups all with == value => 'size
  • 151. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 }
  • 152. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.sum('total) }
  • 153. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.sum('total) } 'total = [3, 74]
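Wrapped into a job, the grouping example might look like the sketch below (the output argument is illustrative); size is just one GroupBuilder aggregate, and others such as sum chain onto the same group:

  IterableSource(List(1, 2, 30, 42), 'num)
    .map('num -> 'lessThanTen) { i: Int => i < 10 }
    .groupBy('lessThanTen) { _.size('size) } // one output row per distinct 'lessThanTen value
    .write(Tsv(args("output")))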
  • 155. Scalding API project / discard
  • 156. Scalding API project / discard map / mapTo
  • 157. Scalding API project / discard map / mapTo flatMap / flatMapTo
  • 158. Scalding API project / discard map / mapTo flatMap / flatMapTo rename
  • 159. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter
  • 160. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique
  • 161. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle
  • 162. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit
  • 163. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug
  • 164. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug Group operations
  • 165. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug Group operations joins
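Joins are listed above but not shown in the deck; a hedged sketch of joinWithSmaller (the sources, field names and relative sizes are assumptions) could look like this:

  // the right-hand pipe should be the smaller of the two
  val people = Tsv(args("people"), ('userId, 'name)).read
  val scores = Tsv(args("scores"), ('scoreUserId, 'score)).read

  people
    .joinWithSmaller('userId -> 'scoreUserId, scores) // inner join by default
    .project('name, 'score)
    .write(Tsv(args("output")))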
  • 166. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) {
  • 167. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output"))
  • 168. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output")) input.read.write(output) }
  • 169. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output")) input.read.write(output) } The End.
  • 170. Main Class - "Runner" import org.apache.hadoop.conf.Configuration import org.apache.hadoop.util.ToolRunner import com.twitter.scalding object ScaldingJobRunner extends App { ToolRunner.run(new Configuration, new scalding.Tool, args) }
  • 171. Main Class - "Runner" import org.apache.hadoop.util.ToolRunner import com.twitter.scalding object ScaldingJobRunner extends App { from App ToolRunner.run(new Configuration, new scalding.Tool, args) }
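With such a runner as the jar's main class (or com.twitter.scalding.Tool used directly), the job is typically submitted with hadoop jar, passing the job class name plus --local or --hdfs and the arguments the job reads via Args; the jar name below is made up:

  hadoop jar scalding-wordcount-assembly.jar pl.project13.scala.oculus.job.WordCountJob \
    --hdfs --input words.txt --output word-counts.tsv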
  • 172. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { }
  • 173. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") }
  • 174. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) }
  • 175. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } def tokenize(text: String): Array[String] = implemented }
  • 176. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { group => group.size('count) } def tokenize(text: String): Array[String] = implemented }
  • 177. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { group => group.size } def tokenize(text: String): Array[String] = implemented }
  • 178. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } def tokenize(text: String): Array[String] = implemented }
  • 179. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } .write(Tsv(outputFile)) def tokenize(text: String): Array[String] = implemented }
  • 180. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") 4{ TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } .write(Tsv(outputFile)) def tokenize(text: String): Array[String] = implemented }
  • 181. Word Count in Scalding
  • 182. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph
  • 183. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot
  • 184. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot M A P
  • 185. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot M A P R E D
  • 186. Word Count in Scalding TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size('count) } .write(Tsv(outputFile))
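Because the whole flow is a plain class, it can also be exercised completely in memory with Scalding's JobTest; a rough sketch (assuming tokenize splits on whitespace, and using ScalaTest only as the harness):

  import com.twitter.scalding._

  class WordCountJobSpec extends org.scalatest.FunSuite {
    test("WordCountJob counts words") {
      JobTest("pl.project13.scala.oculus.job.WordCountJob")
        .arg("input", "inputFile")
        .arg("output", "outputFile")
        .source(TextLine("inputFile"), List((0, "hack hack hack and hack")))
        .sink[(String, Int)](Tsv("outputFile")) { counts =>
          assert(counts.toMap === Map("hack" -> 4, "and" -> 1))
        }
        .run
        .finish
    }
  }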
  • 191. Why Scalding? Hadoop inside Cascading abstractions
  • 192. Why Scalding? Hadoop inside Cascading abstractions Scala conciseness
  • 193. Ask Stuff! Dzięki! Thanks! ありがとう! Konrad Malawski @ java.pl t: ktosopl / g: ktoso b: blog.project13.pl