RHadoop, Hadoop for R
Jan 2012 HUG: RHadoop
(R popularity data: r4stats.com)
The three RHadoop packages, one per member of the Hadoop family:
  rhdfs  - access to the HDFS file system
  rhbase - access to HBase
  rmr    - MapReduce jobs written in R
The guiding analogy, in memory vs. on Hadoop:
  sapply(data, function)
  mapreduce(data, function)
rmr is just an R library; a job is an ordinary R script:
  #!/usr/bin/Rscript
  library(rmr)
  mapreduce(…)
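To make the analogy concrete, here is a minimal sketch assuming the rmr 1.x API used throughout this deck; to.dfs() and from.dfs() (introduced below) move data to and from HDFS:

  library(rmr)

  # in memory: apply a function to each element
  squares.local = sapply(1:10, function(x) x^2)

  # on Hadoop: the same shape of computation as a mapreduce job
  small.ints = to.dfs(1:10)
  squares.hdfs = from.dfs(
    mapreduce(input = small.ints,
              map = function(k, v) keyval(v, v^2)))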
Where the Hadoop programming tools sit (note that rmr appears on both sides):

  Expose MR                             Hide MR
  Java, C++                             Hive, Pig
  rmr, Rhipe, Dumbo, Pydoop, Hadoopy    Cascalog, Scalding, Scrunch
  Cascading, Crunch                     rmr
For comparison, the same k-means computation written against Pig spans three languages: this Python driver script, the Pig Latin query it embeds, and a Java UDF (next listing).

#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01
MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroids: divide the score space equally
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
DEFINE find_centroid FindCentroid('$centroids');
raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
grouped = group centroided by centroid;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'output';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    # run the bound script and collect its run statistics
    results = Q.runSingle()
    if not results.isSuccessful():
        raise RuntimeError("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids of this iteration and calculate how far
    # they moved relative to the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")  # hadoop fs -rmr: clear the output dir for the next pass
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")
The Java UDF the Pig script registers, mapping a gpa to its nearest centroid:

import java.io.IOException;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;

    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i = 0; i < centroidStrings.length; i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }

    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double) input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }
}
mapreduce(input, output, map, reduce)

  input:  one or more hdfs paths, or the output of other mapreduce jobs
  output: hdfs path; defaults to a temp location
  map:    a function of two args returning a keyval(); defaults to identity
  reduce: a function of two args returning a keyval(); defaults to none
map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)
reduce = function(k, vv) keyval(k, length(vv))
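Wired into a complete call, this becomes the sketch below. Note that hash() is an assumption here: base R has no such function, so something like strtoi(substr(digest::digest(k), 1, 6), 16L) would have to stand in for it, and the input path is hypothetical:

  out = mapreduce(
    input = "/tmp/big.dataset",   # hypothetical HDFS path
    # keep roughly 1 key in 10, then count the values kept per key
    map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v),
    reduce = function(k, vv) keyval(k, length(vv)))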
condition = function(x) x > 10

out = mapreduce(
  input = input,
  map = function(k, v)
    if (condition(v)) keyval(k, v))
x = from.dfs(hdfs.object)
hdfs.object = to.dfs(x)
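These two functions are the bridge between R's memory and HDFS. A tiny round-trip sketch, assuming the rmr 1.x API:

  hdfs.object = to.dfs(1:1000)   # push an ordinary R object to HDFS
  x = from.dfs(hdfs.object)      # read it back into memory as key-value pairs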
The same count-distinct query in Hive and in rmr. In Hive:

INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count(DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;

In rmr, as two chained jobs: the inner one dedups (userid, gender) pairs, the outer one counts users per gender:

mapreduce(
  input =
    mapreduce(input = "pv_users",
              map = function(k, v) keyval(v['userid'], v['gender']),
              reduce = function(k, vv) keyval(k, vv[[1]])),
  output = "pv_gender_sum",
  map = function(k, v) keyval(v, 1),
  reduce = function(k, vv) keyval(k, sum(unlist(vv))))
kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a, b) norm(as.matrix(a - b), type = 'F')) {
    # first pass: no centers yet, so points get random assignments
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for(i in 1:iterations) {
      newCenters = lapply(values(newCenters), unlist)
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}

kmeans.iter =
  function(points, distfun, ncenters = length(centers), centers = NULL) {
    from.dfs(
      mapreduce(input = points,
                # no centers: assign each point to a random cluster;
                # otherwise, to the nearest center under distfun
                map = if (is.null(centers)) {
                  function(k, v) keyval(sample(1:ncenters, 1), v)}
                else {
                  function(k, v) {
                    distances = lapply(centers, function(c) distfun(c, v))
                    keyval(centers[[which.min(distances)]], v)}},
                # each new center is the mean of its cluster's points
                reduce = function(k, vv)
                  keyval(NULL, apply(do.call(rbind, vv), 2, mean))))}
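A hypothetical invocation of the code above, assuming the rmr 1.x API and points written to HDFS as a list of numeric vectors (the names and sizes here are made up for illustration):

  points = to.dfs(lapply(1:100, function(i) rnorm(2)))  # 100 random 2-d points
  clusters = kmeans(points, ncenters = 3)               # 10 refinement passes by default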
Additional mapreduce() options: input.specs, output.specs, combine, reduce.on.data.frame, tuning.params, verbose
Under the hood: local and hadoop backends, profiling, managed IO, optimize
Ways to compose jobs (a concrete sketch follows this list):

# chain jobs by nesting calls ...
mapreduce(mapreduce(…
# ... or run one job over several inputs
mapreduce(input = c(input1, input2), …)
# joins are just another library function
equijoin(left.input = input1, right.input = input2, …)
# an intermediate result is a value that can feed any number of jobs
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)
# and whole jobs can be wrapped in ordinary R functions
abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input, output = output)
  …
  result}
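Because a job is an ordinary function call returning a reusable value, pipelines compose without a separate glue language. A minimal sketch under the same rmr 1.x assumptions (the path and the threshold are made up):

  counts = mapreduce(input = "/tmp/words",   # hypothetical HDFS path
                     map = function(k, v) keyval(v, 1),
                     reduce = function(k, vv) keyval(k, sum(unlist(vv))))
  # feed one job's output straight into the next, keeping only frequent words
  frequent = mapreduce(input = counts,
                       map = function(k, v) if (v > 100) keyval(k, v))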
repo: github.com/RevolutionAnalytics/RHadoop/
license: Apache 2.0
documentation: R help, github wiki
Q/A: github issue tracking
email: rhadoop@revolutionanalytics.com
project lead: David Champagne

Editor's Notes

  • #2: What is RHadoop? An open source project started by Revolution Analytics that aims to make R and Hadoop work together.
  • #3: Intro to R: a language for statistics, a replacement for S, the most popular??
  • #4: The hottest people in Hollywood: they must get together. Is it a good idea? Available analysts; the trend towards high-level languages; polyglot infrastructure.
  • #5: Hadoop is one project but also a family of projects. We started the integration path with three projects targeting three members of the Hadoop family. rhdfs provides access to the HDFS file system and can be divided into two sub-APIs: file level and byte level.
  • #6: A way to access big data sets. A simple way to write parallel programs (everyone will have to). Very R-like, building on the functional characteristics of R. Just a library.
  • #7: Much simpler than writing Java. Not as simple as Hive or Pig at what they do, but more general. Great for prototyping; can transition to production -- optimize instead of rewriting! Lower risk, always executable.
  • #8: Skip quickly to the other slides; notice the three different languages.
  • #15: Takeaways: a language like Hive makes a class of problems easy to solve, but it is not a general tool. The cost of doing the same operation in rmr is modest, and it provides a broader set of capabilities.