RHadoop, R meets Hadoop
[Charts: "Scholarly Activity 05-09 change" for R, SAS, SPSS, S-Plus, and Stata (scale -37.5% to 50%), alongside CRAN package counts on a log scale (1 to 10,000) over 2002-2010. Source: http://r4stats.com/popularity]
RHadoop, R meets Hadoop
David Champagne, CTO
rhdfs
rhbase
rmr
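
For a taste of the file-level API, a minimal rhdfs sketch (hdfs.init and hdfs.ls are rhdfs functions; the path is illustrative):

library(rhdfs)
hdfs.init()     # connect to the configured Hadoop cluster
hdfs.ls("/")    # file-level API: list an HDFS directory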
sapply(data, function)

mapreduce(data, map = function)
library(rmr)

mapreduce(…)
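
The parallel is deliberate: mapreduce is meant to feel like an apply-family function. A minimal sketch assuming a working rmr install (the squaring example is illustrative, not from the deck):

# in memory: apply a function to each element
sapply(1:10, function(x) x^2)

# on Hadoop: apply a map function to each (key, value) record
library(rmr)
small.ints = to.dfs(1:10)
from.dfs(mapreduce(input = small.ints,
                   map = function(k, v) keyval(v, v^2)))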
Expose MR                     Hide MR

rmr, Rhipe, Dumbo,            Hive, Pig
Pydoop, Hadoopy
                              Cascalog, Scalding, Scrunch
Java, C++                     Cascading, Crunch
mapreduce(input, output, map, reduce)
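
A complete, hedged sketch of a call (the dfs paths are hypothetical; reduce is optional, so this is a map-only identity job):

out = mapreduce(
  input  = "/tmp/input",    # input can be as simple as a dfs path
  output = "/tmp/output",   # omit output and rmr manages a temporary result
  map    = function(k, v) keyval(k, v))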
map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)   # filter: keep ~1 key in 10

reduce = function(k, vv) keyval(k, length(vv))              # count: values per key
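
hash is not a base R function, so a self-contained version needs a stand-in; one hedged variant (the character-sum hash below is purely illustrative):

hash = function(k) sum(utf8ToInt(as.character(k)))   # crude stand-in for a key hash

sampled.counts = mapreduce(
  input  = input,
  map    = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v),
  reduce = function(k, vv) keyval(k, length(vv)))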
# jobs are easy to parametrize: the predicate is just a captured R function
condition = function(x) x > 10

out = mapreduce(
        input = input,
        map = function(k, v)
                 if (condition(v)) keyval(k, v))
x = from.dfs(hdfs.object)   # read a dfs object back into R memory

hdfs.object = to.dfs(x)     # write an in-memory R object out to the dfs
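
Both ends of the bridge pass through R's memory, so they suit small test sets and job scaffolding. A round-trip sketch (using the built-in mtcars data purely for illustration):

cars.dfs = to.dfs(mtcars)    # push a small data frame out to the dfs
cars = from.dfs(cars.dfs)    # ...and pull it back into memory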
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count (DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;

mapreduce(input =
    mapreduce(input = "pv_users",
      map = function(k, v) keyval(v['userid'], v['gender']),
      reduce = function(uid, genders)
        lapply(unique(genders), function(g) keyval(NULL, g))),
  output = "pv_gender_sum",
  map = function(x, gender) keyval(gender, 1),
  reduce = function(gender, counts)
             keyval(gender, sum(unlist(counts))))
kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) {
    # seed with a random-assignment pass, then run the iteration
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for(i in 1:iterations) {
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}


kmeans.iter =
  function(points, distfun, ncenters = dim(centers)[1], centers = NULL) {
    from.dfs(
      mapreduce(
        input = points,
        # first pass: assign each point to a random center;
        # later passes: assign each point to its nearest center
        map = if (is.null(centers)) {
                 function(k,v) keyval(sample(1:ncenters,1),v)}
              else {
                 function(k,v) {
                   distances = apply(centers, 1, function(c) distfun(c,v))
                   keyval(centers[which.min(distances),], v)}},
        # new center = componentwise mean of the points assigned to it
        reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))),
      to.data.frame = T)}
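
A hedged usage sketch; the deck shows only the two functions, so the toy dataset here is an assumption:

points = to.dfs(lapply(1:100, function(i) rnorm(2)))   # 100 random 2-D points, one per record
clusters = kmeans(points, ncenters = 5)                # five centers after 10 iterations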
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids for this iteration, calculate the distance moved since the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move<tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i!=k-1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;


public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;
    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i=0;i<centroidStrings.length;i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }
    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double)input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }

}
mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin = function(
    left.input, right.input, input,
    output,
    outer,
    map.left, map.right,
    reduce, reduce.all)
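
The deck gives only the signature; a sketch of an invocation using those argument names (the dfs paths are hypothetical):

joined = equijoin(
    left.input  = "/tmp/purchases",
    right.input = "/tmp/customers",
    output      = "/tmp/joined")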
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
   …
   result = mapreduce(input = input,
                      output = output)
   …
   result}
input.format, output.format, format
combine
reduce.on.data.frame
local, hadoop backends
backend.parameters
profiling
verbose
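
Most of these surface as mapreduce arguments or package options. A sketch only -- exact option names and accepted values varied across rmr releases, so the csv shortcut below is an assumption:

out = mapreduce(
  input = "/tmp/in.csv",
  input.format = "csv",    # assumed format shortcut
  map = function(k, v) keyval(k, v))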
RHADOOP USER
ONE FAT CLUSTER AVE.
HYDROPOWER CITY, OR 0x0000




RHADOOP@REVOLUTIONANALYTICS.COM


Editor's Notes

  • #2: What is R; what is RHadoop -- an open source project, started by Revolution, that aims to make R and Hadoop work together; what is Revolution.
  • #6: Faster, assured builds; large-data extensions; web deployments; tech support; consulting services; training.
  • #8: Hadoop brings horizontal scalability, R brings sophisticated analytics; the combination could be powerful.
  • #9-25: Hadoop is one project but also a family of projects. We started the integration path with three projects targeting three members of the Hadoop family. rhdfs provides access to the HDFS file system and can be divided into two sub-APIs: file level and byte level.
  • #26: A way to access big data sets.
  • #27: A simple way to write parallel programs -- everyone will have to.
  • #28: Very R-like, building on the functional characteristics of R.
  • #29: Just a library.
  • #30-36: Much simpler than writing Java. Not as simple as Hive or Pig at what they do, but more general. Great for prototyping; can transition to production -- optimize instead of rewriting. Lower risk, always executable.
  • #37-44: mapreduce is the first and most important element of the API. Input can be as simple as a path; output likewise, or skip it for managed space with stubs. map and reduce are simple R functions, as opposed to Rhipe.
  • #45-57: Simple map example -- filtering; reduce example -- counting.
  • #58-63: Easy to parametrize jobs.
  • #64: Second pillar of the API: the memory-HDFS bridge.
  • #65-66: A language like Hive makes a class of problems easy to solve, but it is not a general tool. The cost of doing the same operation in rmr is modest, and it provides a broader set of capabilities.
  • #67-70: k-means implementation in two simple functions; note how easy it is to get data in and out of the cluster.
  • #71: Skip quickly to other slides; notice three different languages.
  • #74-76: More things you can do combining the elements of the API.