Jan 2012 HUG: RHadoop


Description

RHadoop is an open source project aiming to combine two rising stars in the analytics firmament: R and Hadoop. With more than 2M users, R is arguably the dominant language for expressing complex statistical computations. Hadoop needs no introduction at HUG. With RHadoop we are trying to combine the expressiveness of R with the scalability of Hadoop, and to pave the way for the statistical community to tackle big data with tools they are already familiar with. At this time RHadoop is a collection of three packages that interface with HDFS, HBase and MapReduce, respectively. For MapReduce, the package is called rmr, and we have tried to give it a simple, high-level interface that is true to the MapReduce model and integrated with the rest of the language. We will cover the API and provide some examples.

Transcript of Jan 2012 HUG: RHadoop

RHadoop, Hadoop for R

R popularity data: r4stats.com

rhdfs: interface to HDFS
rhbase: interface to HBase
rmr: interface to MapReduce
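As a quick taste of the first two packages, a hedged sketch (the function names are from rhdfs and rhbase; initialization details vary by cluster):

library(rhdfs)
hdfs.init()          # connect to HDFS
hdfs.ls("/user")     # list a directory

library(rhbase)
hb.init()            # connect via the HBase Thrift gateway
hb.list.tables()     # enumerate HBase tables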

sapply(data, function)

mapreduce(data, function)
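The analogy: sapply applies a function to an in-memory collection, while mapreduce applies one to a collection resident on the distributed file system. A minimal sketch (the doubling function is illustrative):

sapply(1:10, function(x) 2 * x)                  # local

mapreduce(input = to.dfs(1:10),                  # distributed, same idea
          map = function(k, v) keyval(k, 2 * v))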

#!/usr/bin/Rscript
library(rmr)
mapreduce(…)

[Slide: a chart of Hadoop programming tools, arranged along an "Expose MR" to "Hide MR" axis]

Hive, Pig
rmr, Rhipe, Dumbo, Pydoop, Hadoopy
Java, C++
Cascalog, Scalding, Scrunch
Cascading, Crunch

For comparison, a complete k-means clustering driver written by embedding Pig in Python (Jython):

#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroids, equally dividing the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()

    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids of this iteration and the distance moved since the last one
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k-1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;

    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i = 0; i < centroidStrings.length; i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }

    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double)input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }
}

mapreduce(input, output, map, reduce)

input: one or more HDFS paths, or the output of other mapreduce jobs
output: an HDFS path; defaults to a temporary location
map: a function of two arguments returning a keyval(); defaults to the identity
reduce: a function of two arguments returning a keyval(); default none
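Since everything but input has a default, the smallest possible job just streams the data through the identity map to a temporary location; a minimal sketch, with an illustrative path:

# returns a handle to the (temporary) output, usable as input to another job
out = mapreduce(input = "/tmp/some/input")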

map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)

reduce = function(k, vv) keyval(k, length(vv))
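Plugging fragments like these into a call gives, for example, a job that keeps roughly one key in ten and counts the surviving values per key; a sketch with an illustrative input path:

key.counts = mapreduce(
  input = "/data/logs",
  map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v),
  reduce = function(k, vv) keyval(k, length(vv)))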

condition = function(x) x > 10

out = mapreduce(input = input, map = function(k, v) if (condition(v)) keyval(k, v))

x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)
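The two combine into a convenient round trip for small data; a sketch (the sample data is illustrative):

groups = to.dfs(sample(1:5, 100, replace = TRUE))  # local vector to the dfs
counts = from.dfs(                                 # job output back into R
  mapreduce(input = groups,
            map = function(k, v) keyval(v, 1),
            reduce = function(k, vv) keyval(k, sum(unlist(vv)))))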

INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count(DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;

The same computation in rmr takes two chained jobs: the inner one reduces each user to a single (userid, gender) record, deduplicating; the outer one counts the distinct users per gender:

mapreduce(
  input =
    mapreduce(
      input = "pv_users",
      map = function(k, v) keyval(v['userid'], v['gender']),
      reduce = function(k, vv) keyval(k, vv[[1]])),
  output = "pv_gender_sum",
  map = function(k, v) keyval(v, 1),
  reduce = function(k, vv) keyval(k, sum(unlist(vv))))

kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a, b) norm(as.matrix(a - b), type = 'F')) {
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for(i in 1:iterations) {
      newCenters = lapply(values(newCenters), unlist)
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}

kmeans.iter =
  function(points, distfun, ncenters = length(centers), centers = NULL) {
    from.dfs(
      mapreduce(input = points,
        map = if (is.null(centers)) {
                function(k, v) keyval(sample(1:ncenters, 1), v)}
              else {
                function(k, v) {
                  distances = lapply(centers, function(c) distfun(c, v))
                  keyval(centers[[which.min(distances)]], v)}},
        reduce = function(k, vv)
                   keyval(NULL, apply(do.call(rbind, vv), 2, mean))))}
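A hypothetical invocation, assuming each point is stored as one keyval record on the dfs (the generated data and parameter values are illustrative):

pts = to.dfs(lapply(1:100, function(i) keyval(i, rnorm(2))))  # 100 random 2-D points
centers = kmeans(pts, ncenters = 4, iterations = 5)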

input.specs, output.specs
combine
reduce.on.data.frame
tuning.params
verbose

local, hadoop backends
profiling
managed IO
optimize

mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin(left.input = input1, right.input = input2, …)

out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input, output = output)
  …
  result}
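The last pattern makes a job an ordinary R function, so parameters can be captured in closures; a hedged sketch (scale.job and factor are illustrative names):

scale.job = function(input, output, factor)
  mapreduce(input = input,
            output = output,
            map = function(k, v) keyval(k, v * factor))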

repo: github.com/RevolutionAnalytics/RHadoop/

license: Apache 2.0

documentation: R help, github wiki

Q/A: github issue tracking

email: rhadoop@revolutionanalytics.com

project lead: David Champagne