Jan 2012 HUG: RHadoop

RHadoop, Hadoop for R

Description

RHadoop is an open source project aiming to combine two rising stars in the analytics firmament: R and Hadoop. With more than 2M users, R is arguably the dominant language for expressing complex statistical computations. Hadoop needs no introduction at HUG. With RHadoop we are trying to combine the expressiveness of R with the scalability of Hadoop, and to pave the way for the statistical community to tackle big data with tools they are familiar with. At this time RHadoop is a collection of three packages that interface with HDFS, HBase and mapreduce, respectively. The mapreduce package is called rmr, and we have tried to give it a simple, high-level interface that is true to the mapreduce model and integrated with the rest of the language. We will cover the API and provide some examples.

Transcript of Jan 2012 HUG: RHadoop

Page 1: Jan 2012 HUG: RHadoop

RHadoop, Hadoop for R

Page 2: Jan 2012 HUG: RHadoop

[Slide: chart of R's popularity; source: r4stats.com]

Page 3: Jan 2012 HUG: RHadoop
Page 4: Jan 2012 HUG: RHadoop

rhdfs

rhbase

rmr

Page 5: Jan 2012 HUG: RHadoop

sapply(data, function)

mapreduce(data, function)

#!/usr/bin/Rscript

library(rmr)

mapreduce(…)
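Filled in, a minimal end-to-end job in this style might look like the following sketch (assuming the rmr 1.x API shown on these slides and a working Hadoop setup; the squaring job is illustrative only):

#!/usr/bin/Rscript
library(rmr)

# write a small vector to HDFS, square each value in a map-only job, read the result back
small.ints = to.dfs(1:1000)
result = mapreduce(
  input = small.ints,
  map = function(k, v) keyval(v, v^2))
from.dfs(result)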

Page 6: Jan 2012 HUG: RHadoop

[Slide: tools for writing Hadoop jobs, arranged roughly along an "Expose MR" vs. "Hide MR" axis, with rmr highlighted]

Expose MR: Java, C++; Cascading, Crunch; rmr, Rhipe, Dumbo, Pydoop, Hadoopy

Hide MR: Hive, Pig; Cascalog, Scalding, Scrunch

Page 7: Jan 2012 HUG: RHadoop

#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroids, equally dividing the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()

Page 8: Jan 2012 HUG: RHadoop

    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids for this iteration, calculate the distance moved since the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i!=k-1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")

Page 9: Jan 2012 HUG: RHadoop

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;

    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i = 0; i < centroidStrings.length; i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }

    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double)input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }
}

Page 10: Jan 2012 HUG: RHadoop

mapreduce(input, output, map, reduce)

input: one or more HDFS paths, or the output of other mapreduce jobs

output: an HDFS path; defaults to a temp location

map: a function of two args returning a keyval(); default identity

reduce: a function of two args returning a keyval(); default none
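Put together, a call might look like this sketch (the paths are placeholders, not real datasets):

out = mapreduce(
  input = "some/input/path",                      # one or more HDFS paths
  output = "some/output/path",                    # omit to get a temp location back
  map = function(k, v) keyval(k, v),              # spelling out the identity default
  reduce = function(k, vv) keyval(k, length(vv))) # omit for a map-only job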

Page 11: Jan 2012 HUG: RHadoop

map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)

reduce = function(k, vv) keyval(k, length(vv))
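Plugged into a single job, the two functions above would, roughly, keep one key in ten and count the surviving records per key (a sketch; it assumes the hash() helper used in the map above and an input bound elsewhere):

sampled.counts = mapreduce(
  input = input,
  map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v),
  reduce = function(k, vv) keyval(k, length(vv)))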

Page 12: Jan 2012 HUG: RHadoop

condition = function(x) x > 10

out = mapreduce(
  input = input,
  map = function(k, v) if (condition(v)) keyval(k, v))

Page 13: Jan 2012 HUG: RHadoop

x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)
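As a sketch of the round trip: to.dfs writes an ordinary R object to HDFS (a temp location by default) and from.dfs reads it back, which is how the small test inputs in the other examples cross the boundary:

hdfs.object = to.dfs(1:100)  # local vector to HDFS
x = from.dfs(hdfs.object)    # back to local key-value pairs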

Page 14: Jan 2012 HUG: RHadoop

INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count(DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;

mapreduce(
  input =
    mapreduce(
      input = "pv_users",
      map = function(k, v) keyval(v['userid'], v['gender']),
      reduce = function(k, vv) keyval(k, vv[[1]])),
  output = "pv_gender_sum",
  map = function(k, v) keyval(v, 1),
  reduce = function(k, vv) keyval(k, sum(unlist(vv))))

Page 15: Jan 2012 HUG: RHadoop

kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a, b) norm(as.matrix(a-b), type = 'F')) {
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for(i in 1:iterations) {
      newCenters = lapply(values(newCenters), unlist)
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}

kmeans.iter =
  function(points, distfun, ncenters = length(centers), centers = NULL) {
    from.dfs(
      mapreduce(input = points,
        map = if (is.null(centers)) {
                function(k, v) keyval(sample(1:ncenters, 1), v)}
              else {
                function(k, v) {
                  distances = lapply(centers, function(c) distfun(c, v))
                  keyval(centers[[which.min(distances)]], v)}},
        reduce = function(k, vv)
                   keyval(NULL, apply(do.call(rbind, vv), 2, mean))))}
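Hypothetical usage of the above, clustering 100 random 2-D points into 4 groups (the data layout, one numeric vector per record, is an assumption, as are the sizes):

points = to.dfs(lapply(1:100, function(i) rnorm(2)))  # one 2-D point per record
clusters = kmeans(points, ncenters = 4, iterations = 5)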

Page 16: Jan 2012 HUG: RHadoop

input.specs, output.specs
combine
reduce.on.data.frame
tuning.params
verbose

local, hadoop backends
profiling
managed IO
optimize

Page 17: Jan 2012 HUG: RHadoop

mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin(left.input = input1, right.input = input2, …)

out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input, output = output)
  …
  result}
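As a sketch of that last pattern, a parameterized job is just an ordinary R function wrapping mapreduce; for instance this hypothetical count.by.value, which counts how often each distinct value occurs:

count.by.value = function(input, output = NULL)
  mapreduce(
    input = input,
    output = output,
    map = function(k, v) keyval(v, 1),
    reduce = function(k, vv) keyval(k, sum(unlist(vv))))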

Page 18: Jan 2012 HUG: RHadoop

repo: github.com/RevolutionAnalytics/RHadoop/

license: Apache 2.0

documentation: R help, github wiki

Q/A: github issue tracking

email: [email protected]

project lead: David Champagne