Jan 2012 HUG: RHadoop
-
Upload
yahoo-developer-network -
Category
Technology
-
view
5.798 -
download
1
description
Transcript of Jan 2012 HUG: RHadoop
RHadoop, Hadoop for R
r4stats.com
rhdfs
rhbase
rmr
sapply(data, function)
mapreduce(data, function)
#!/usr/bin/Rscript
library(rmr)
mapreduce(…)
Hive, Pig
Rmr, Rhipe, Dumbo, Pydoop
Hadoopy
Java, C++
Cascalog, Scalding, Scrunch
Cascading, Crunch
Rmr
Expose MR Hide MR
#!/usr/bin/python
# K-means clustering of the gpa column of student.txt, driven from Python
# through Pig's embedded scripting API. Each pass runs one Pig job that assigns
# every gpa to a centroid (FindCentroid UDF) and averages each group; the loop
# stops when the average centroid movement drops below `tolerance` or after
# MAX_ITERATION passes.
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
# centroids are handed to the UDF as one colon-separated string
initial_centroids = ":".join(str(c) for c in last_centroids)

# NOTE(review): the statements are laid out one per line; `register udf.jar`
# has no ';' of its own, so it presumably relies on the line break to end.
P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    # isSuccessful() is a boolean; comparing it to the string "FAILED" was
    # always false, so failures went unnoticed.
    if not results.isSuccessful():
        raise RuntimeError("Pig job failed")
    rows = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance with last iteration
    # NOTE(review): assumes the job yields k rows in the same order as
    # last_centroids - confirm against the Pig output ordering.
    for i in range(k):
        row = rows.next()
        centroids[i] = float(str(row.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    # remove the job output so the next pass can store into 'output' again
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move<tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ":".join(str(c) for c in last_centroids)
    iter_num += 1

if not converged:
    print("not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")
import java.io.IOException;
import org.apache.pig.EvalFunc;import org.apache.pig.data.Tuple;
/**
 * Pig eval function that maps a value to the closest of a fixed set of centroids.
 *
 * <p>The centroids are supplied once, at construction time, as a colon-separated
 * list of decimal numbers (for example {@code "0.0:1.0:2.0:3.0"}).
 */
public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;

    /**
     * Parses the colon-separated centroid list.
     *
     * @param initialCentroid centroid values separated by {@code ':'}
     * @throws NumberFormatException if a piece cannot be parsed as a double
     */
    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i = 0; i < centroidStrings.length; i++) {
            centroids[i] = Double.parseDouble(centroidStrings[i]);
        }
    }

    /**
     * Returns the centroid with the smallest absolute distance to the first
     * field of {@code input}; on a tie the earlier centroid wins.
     *
     * @param input tuple whose first field is read as a {@code Double}
     * @return the closest centroid, or {@code null} when the tuple is null,
     *     empty, or its first field is null
     * @throws IOException if the tuple field cannot be read
     */
    @Override
    public Double exec(Tuple input) throws IOException {
        // A null/empty tuple or null field yields null instead of the
        // NullPointerException the unboxing cast used to throw.
        if (input == null || input.size() == 0) {
            return null;
        }
        Object field = input.get(0);
        if (field == null) {
            return null;
        }
        // Read and unbox once rather than on every loop iteration.
        double value = (Double) field;

        double minDistance = Double.MAX_VALUE;
        double closestCentroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - value);
            if (distance < minDistance) {
                minDistance = distance;
                closestCentroid = centroid;
            }
        }
        return closestCentroid;
    }
}
mapreduce(input, output, map, reduce)
one or more hdfs paths or output of other mapreduce jobs
hdfs path, default to temp location
a function of two args returning a keyval(), default identity
a function of two args returning a keyval(), default none
map = function(k, v) if (hash(k) %% 10 == 0)keyval(k, v)
reduce = function(k, vv) keyval(k, length(vv))
condition = function(x) x >10
out = mapreduce( input = input, map = function(k,v) if (condition(v)) keyval(k,v))
x = from.dfs(hdfs.object)
hdfs.object = to.dfs(x)
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count (DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;
mapreduce(input =
mapreduce(input = "pv_users",
map = function(k, v) keyval(v['userid'], v['gender']),
reduce = function(k, vv) keyval(k, vv[[1]]),
output = "pv_gender_sum",
map = function(k,v) keyval(v, 1)
reduce = function(k, vv) keyval(k, sum(unlist(vv)))
# k-means as a chain of mapreduce jobs.
#   points     - input handed to kmeans.iter (and from there to mapreduce)
#   ncenters   - number of clusters used for the first pass
#   iterations - number of refinement passes after the first pass
#   distfun    - function(a, b) giving the distance between two points;
#                defaults to the Frobenius norm of a - b
# Returns whatever the last kmeans.iter call returns.
kmeans = function(points, ncenters, iterations = 10,
                  distfun = function(a, b) norm(as.matrix(a - b), type = 'F')) {
  # first pass: no centers yet, kmeans.iter is only told how many to produce
  newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
  # seq_len() instead of 1:iterations, which would loop over c(1, 0)
  # when iterations is 0
  for (i in seq_len(iterations)) {
    newCenters = lapply(values(newCenters), unlist)
    newCenters = kmeans.iter(points, distfun, centers = newCenters)
  }
  newCenters
}
# One k-means pass expressed as a single mapreduce job, read back with from.dfs.
#   points   - input for mapreduce
#   distfun  - function(center, point) returning a distance
#   ncenters - number of cluster ids to draw from when centers is NULL;
#              defaults to length(centers)
#   centers  - list of current centers, or NULL for the first pass
kmeans.iter = function(points, distfun, ncenters = length(centers), centers = NULL) {
  # first pass: key each point with a randomly drawn cluster id
  random.assign = function(k, v) {
    keyval(sample(1:ncenters, 1), v)
  }
  # later passes: key each point with the center at minimum distance
  nearest.assign = function(k, v) {
    dists = lapply(centers, function(ctr) distfun(ctr, v))
    keyval(centers[[which.min(dists)]], v)
  }
  # stack the points of one group row-wise and take the mean of each column
  column.means = function(k, vv) {
    keyval(NULL, apply(do.call(rbind, vv), 2, mean))
  }

  job = mapreduce(
    input = points,
    map = if (is.null(centers)) random.assign else nearest.assign,
    reduce = column.means)
  from.dfs(job)
}
input.specs, output.specs; combine; reduce.on.data.frame; tuning.params; verbose
local, hadoop backends; profiling; managed IO; optimize
mapreduce(mapreduce(…
mapreduce(input = c(input1, input2), …)
equijoin(left.input = input1, right.input = input2, …)
out1 = mapreduce(…)mapreduce(input = out1, <xyz>)mapreduce(input = out1, <abc>)
abstract.job = function(input, output, …) { … result = mapreduce(input = input, output = output) … result}
repo: github.com/RevolutionAnalytics/RHadoop/
license: Apache 2.0
documentation: R help, github wiki
Q/A: github issue tracking
project lead: David Champagne