Apache Spark Tutorial


Hadoop MapReduce

Apache Spark

2015-04-28

MapReduce

Counting word frequencies (Python):

frequency = defaultdict(int)

for line in opened_file:
    for word in some_splitter(line):
        frequency[word] += 1

for word in frequency:
    some_output(word, frequency[word])

The same count written more Pythonically with a Counter:

frequency = collections.Counter(
    word
    for line in opened_file
    for word in some_splitter(line))

for word, count in frequency.iteritems():
    some_output(word, count)

The hot spot is this loop:

for line in opened_file:
    for word in some_splitter(line):
        frequency[word] += 1

The work can be split in two by hashing each word, with each half counting only its own share:

frequency = defaultdict(int)

for line in opened_file:
    for word in some_splitter(line):
        if hash(word) % 2 == 1:
            frequency[word] += 1

frequency = defaultdict(int)

for line in opened_file:
    for word in some_splitter(line):
        if hash(word) % 2 == 0:
            frequency[word] += 1

OK. The two halves can also be kept side by side in one program:

f = [defaultdict(int) for i in range(2)]

for line in opened_file:
    for word in some_splitter(line):
        f[hash(word) % 2][word] += 1

OK.
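Because words are routed by hash(word) % 2, the two loops above are independent counting jobs, and each partition owns a disjoint set of keys. A minimal sketch (not from the slides; the helper names are hypothetical) of running the partitions and combining their results:

from collections import defaultdict

def count_partition(lines, splitter, num_parts, part):
    # Count only the words whose hash falls into this partition.
    freq = defaultdict(int)
    for line in lines:
        for word in splitter(line):
            if hash(word) % num_parts == part:
                freq[word] += 1
    return freq

def merge(partials):
    # Keys never overlap across partitions, so a plain union suffices.
    total = {}
    for partial in partials:
        total.update(partial)
    return total

lines = ["to be or not to be", "to be is to do"]
partials = [count_partition(lines, str.split, 2, p) for p in range(2)]
frequency = merge(partials)
print(frequency)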

That works for 2 partitions, but not by hand for 100 or 10,000. This is where Hadoop comes in.

Hadoop

Hadoop MapReduce: a job that counts the lines of its input.

/*
 * With FileInputFormat, (1) the input key is a LongWritable (the byte offset),
 * (2) the input value is Text (one line), (3) the map output key and
 * (4) the map output value follow as the remaining type parameters.
 */
public static class Map extends Mapper<LongWritable, Text, Text, LongWritable> {

    private Text outKey = new Text("LINES");
    private LongWritable outValue = new LongWritable(1);

    @Override
    protected void map(LongWritable inputKey, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(outKey, outValue);
    }
}

/*
 * The reducer sums the 1s emitted for each key.
 */
public static class Reduce extends Reducer<Text, LongWritable, Text, LongWritable> {

    private LongWritable outValue = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0L;
        for (LongWritable value : values) {
            total += value.get();
        }
        outValue.set(total);
        context.write(key, outValue);
    }
}

Apache Spark

Spark is a processing engine that can take the place of Hadoop MapReduce while reading the same Hadoop-HDFS data.
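For comparison, a minimal PySpark sketch (not from the slides; the input path is hypothetical) of the same line count:

# Count the lines of a text file. sc is the SparkContext provided by the shell.
sc.textFile("input.txt").count()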

Spark offers Scala, Python, and Java APIs; jobs written in Scala or Python stay far shorter than the MapReduce Java above, and they can read the same HDFS data.

Spark vs. Hadoop MapReduce -- an example:

Read tweets.txt, take the fifth tab-separated column, and keep the values that contain a given search string:

(Python)
sc.textFile(u'tweets.txt').\
    map(lambda x: x.split('\t')[4]).\
    filter(lambda x: u'' in x)

(Scala)
sc.textFile("tweets.txt").
  map(_.split("\t")(4)).
  filter(_.contains(""))
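These two snippets only define transformations; nothing is computed until an action runs. A minimal sketch (not from the slides) of pulling results back, with 'search-term' as a hypothetical placeholder for the filter string:

# Trigger the pipeline with actions: take(10) returns a sample of matching
# column values to the driver, count() returns how many lines matched.
matched = sc.textFile(u'tweets.txt') \
    .map(lambda x: x.split('\t')[4]) \
    .filter(lambda x: u'search-term' in x)

print matched.take(10)
print matched.count()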

Running Spark on Hadoop-YARN, with one driver and a number of executors (6 in this example):

spark-shell --master yarn-client --num-executors 6

[Cluster diagrams, repeated across several slides: nodes node00 .. node20; the driver runs inside the spark-shell process, the master on one node, and the executors are spread over the worker nodes]

The WebUI shows the driver and the executors.

On YARN it is the YARN Resource Manager that allocates the containers the Spark executors run in.

Running Spark locally with 2 worker threads:

# Mac (Homebrew)
brew install apache-spark

# Scala
spark-shell --master "local[2]"

# Python
pyspark --master "local[2]"

WebUI: http://localhost:4040/jobs/
The WebUI shows a.reduce (below) running as 2 tasks and b.reduce as 1 task.

(Python)
# sum the integers 0 .. 999999999
from operator import add
a = sc.parallelize(xrange(1000000000), 2)
b = sc.parallelize(xrange(1000000000), 1)

a.reduce(add)
# => 499999999500000000

b.reduce(add)
# => 499999999500000000

(Scala)
// 0 .. 999999999
val a = sc.parallelize(1L until 1000000000, 2)
val b = sc.parallelize(1L until 1000000000, 1)

a.reduce(_ + _)
// => 499999999500000000

b.reduce(_ + _)
// => 499999999500000000

SparkContext: sc is the SparkContext, created automatically by spark-shell / pyspark.
The SparkContext is the connection to the cluster; it hands work to the executors.
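The number of partitions passed to parallelize above decides how many tasks an action such as reduce() spawns. A minimal sketch (not from the slides) of checking the split, assuming a recent PySpark:

# getNumPartitions() returns the partition count; glom() turns each partition
# into a list, so collect() shows exactly how the elements were divided
# (only do this on small RDDs -- it pulls everything to the driver).
small = sc.parallelize(xrange(10), 2)
print small.getNumPartitions()   # => 2
print small.glom().collect()     # => [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]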

a = sc.parallelize(xrange(1000000000), 2) creates an RDD, a "Resilient Distributed Dataset":
a dataset split across the executors, on which operations such as map and filter can be applied.

Creating an RDD: RDDs are created through the SparkContext.

(Python)
sc.parallelize([1, 2, 3, 4, 5])
# makes an RDD from a local collection

sc.textFile("file.text")
sc.textFile("directory/*.gz")
# makes an RDD from text files (compressed files and glob patterns are accepted)
# on Hadoop-YARN the paths usually point at HDFS; on AWS they can point at S3

RDD actions:

(Python)
nums = sc.parallelize([4, 3, 2, 5, 1])

nums.collect()
# => [4, 3, 2, 5, 1]   all elements

nums.take(3)
# => [4, 3, 2]   the first 3 elements

nums.top(3)
# => [5, 4, 3]   the largest 3 elements
# the ordering uses Python's cmp() / Comparable in Java and Scala

(Python)
nums = sc.parallelize(xrange(1000))

nums.count()
# => 1000   the number of elements

nums.reduce(lambda x, y: x + y)
# => 499500
# evaluated as (((((0+1)+2)+3)+...+998)+999)

nums.saveAsTextFile("destination")
# writes the RDD out as text files

RDD transformations: operations that build a new RDD from an RDD

(Python)
nums = sc.parallelize([1, 2, 3])

nums.map(lambda x: x * x)
# => [1, 4, 9]

nums.filter(lambda x: x % 2 == 0)
# => [2]

nums.flatMap(lambda x: range(x))
# => [0, 0, 1, 0, 1, 2]
# each element expands to zero or more elements of a single flat RDD

RDD transformations on key-value (2-tuple) RDDs:

(Python)
pets = sc.parallelize(
    [("cat", 1), ("dog", 1), ("cat", 2)])

pets.groupByKey()
# => [("cat", [1, 2]), ("dog", [1])]

pets.reduceByKey(lambda x, y: x + y)
# => [("cat", 3), ("dog", 1)]

pets.sortByKey()
# => [("cat", 1), ("cat", 2), ("dog", 1)]
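With flatMap, map, and reduceByKey, the word-frequency job from the start of the deck becomes a short pipeline. A minimal sketch (not from the slides; lines.txt is a hypothetical input file):

# Distributed word count with the operations introduced above.
from operator import add

counts = sc.textFile("lines.txt") \
    .flatMap(lambda line: line.split()) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(add)

print counts.take(10)   # a sample of (word, count) pairs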

RDD transformations on two key-value (2-tuple) RDDs:

(Python)
pets = sc.parallelize(
    [("cat", 1), ("dog", 1), ("cat", 2)])
names = sc.parallelize(
    [("cat", "Tama"), ("dog", "Pochi")])

pets.join(names)
# => [("dog", (1, "Pochi")),
#     ("cat", (1, "Tama")),
#     ("cat", (2, "Tama"))]

pets.cogroup(names)
# => [("dog", ([1], ["Pochi"])),
#     ("cat", ([1, 2], ["Tama"]))]
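A common pattern built from these pairwise operations is a per-key aggregate. A minimal sketch (not from the slides) that computes the average value per key of the pets RDD above:

# First reduce each key to a (sum, count) pair, then divide with mapValues.
sums_counts = pets.map(lambda kv: (kv[0], (kv[1], 1))) \
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

averages = sums_counts.mapValues(lambda sc_pair: float(sc_pair[0]) / sc_pair[1])

print averages.collect()   # => [("cat", 1.5), ("dog", 1.0)] (order may vary)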

Many more RDD operations exist:

foreach, collect, reduce, fold, count, saveAsTextFile,
map, flatMap, filter, collect, distinct, sample, sortByKey, groupByKey, reduceByKey,
zip, cartesian, union, intersection, subtract, repartition, join, cogroup, etc.,
take, first, top, takeOrdered, etc.,
cache, unpersist, checkpoint, etc.

RDD API references:
http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.RDD

http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD

http://www.ne.jp/asahi/hishidama/home/tech/scala/spark/RDD.html

Environment for the following examples: the "mezcal" cluster, Spark 1.2.0.

RDDs and partitions

[Cluster diagram: node1, node2, node3, one executor each]

(Python)
sc.parallelize(xrange(100), 3).count()
# 3 partitions, so the count runs as 3 tasks
# partition contents: 0..32 / 33..65 / 66..99

(Python)
sc.parallelize(xrange(100), 1).count()
# 1 partition, so the count runs as 1 task; all of 0..99 sits on a single executor

(Python)
sc.textFile("/work/test/data.txt").count()   # data.txt is 128 MB on HDFS
# 1 task: the file fits in one 128 MB HDFS block, so the RDD gets 1 partition
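When a file is read with textFile, the number of partitions follows the HDFS block layout by default, but a larger minimum can be requested at read time. A minimal sketch (not from the slides), reusing the data.txt path above:

# Ask for at least 4 input partitions even though the file is a single
# 128 MB HDFS block; later stages then run with more parallelism.
rdd = sc.textFile("/work/test/data.txt", minPartitions=4)
print rdd.getNumPartitions()   # => 4 (or more)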

(Python)
sc.textFile("/work/test/large.txt").count()   # large.txt is 300 MB of plain text
# 3 tasks: the file spans 3 HDFS blocks (128 MB each), so the RDD gets 3 partitions
[Diagram: large.txt block1 / block2 / block3, one block per executor on node1..node3]

What about compressed input? A 300 MB gzip file is also stored as 3 HDFS blocks,
but a gzip stream cannot be split at the 128 MB block boundaries.

[Diagram: data.gz block1 / block2 / block3 spread over node1..node3 in HDFS]

(Python)
sc.textFile("/work/test/data.gz").count()   # data.gz is 300 MB
# 1 task: the gzip file cannot be split, so the RDD gets a single partition;
# blocks 2 and 3 are shipped to the executor that reads block 1
# (bzip2-compressed input, by contrast, can be split)

(Python)
sc.textFile("/work/test/p*.gz").count()   # 3 gzip files of roughly 128 MB each
# 3 tasks: one partition per file (p0.gz, p1.gz, p2.gz)
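If the input really is one large gzip file, the read itself stays single-threaded, but the data can be reshuffled afterwards so that the following stages run in parallel. A minimal sketch (not from the slides):

# Reading data.gz is still one task, but repartition(3) shuffles the lines
# into 3 partitions, so everything after it runs as 3 tasks.
lines = sc.textFile("/work/test/data.gz").repartition(3)
print lines.getNumPartitions()   # => 3
print lines.count()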

Example: working through hightemp.txt

(Python)
src = sc.textFile("hightemp.txt")
# read the file into an RDD of lines

tuples = src \
    .map(lambda x: x.split("\t")) \
    .filter(lambda x: len(x) > 3)

# extract columns 1 and 2 (problem 12 of the "100 knocks" exercise set)
col1 = tuples.map(lambda x: x[0])
col2 = tuples.map(lambda x: x[1])

hightemp.txt is tab-separated with 4 columns: prefecture, city, temperature, date
(the place names are shown as placeholders below):

<prefecture>\t<city>\t41\t2013-08-12
<prefecture>\t<city>\t40.9\t2007-08-16
<prefecture>\t<city>\t40.9\t2007-08-16
<prefecture>\t<city>\t40.8\t1933-07-25
<prefecture>\t<city>\t40.7\t2013-08-10
:

col1.count()
# => 24
# runs the whole pipeline: split -> filter -> map -> count

col1.saveAsTextFile("col1")
# runs the pipeline again: split -> filter -> map -> saveAsTextFile
# the output is a directory named col1 containing 2 part files
# (part-00000, part-00001)

Caching:

col1.cache()
# mark col1 to be kept in memory once it has been computed

col1.count()
# => 24
# the first action still runs split -> filter -> map -> count,
# and the result of the map is now cached

col1.distinct().count()
# => 12
# number of distinct values in column 1 (100 knocks, problem 17)
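A quick way to see what cache() buys is to time the same action before and after the data is held in memory. A minimal sketch (not from the slides), assuming the cached col1 RDD above:

# Compare the first count (computes and then caches col1) with a second
# count (served from the in-memory copy).
import time

t0 = time.time()
col1.count()     # computes split -> filter -> map, then caches the result
t1 = time.time()
col1.count()     # reads the cached partitions
t2 = time.time()

print "first count: %.3fs, second count: %.3fs" % (t1 - t0, t2 - t1)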

col1.take(2)
# => [u'\u9ad8\u77e5\u770c', u'\u57fc\u7389\u770c']

# sort by column 2
sorted_by_col2 = tuples.sortBy(lambda x: x[1])

# sort by column 3 (the temperature), descending (100 knocks, problem 18)
sorted_by_col3 = tuples \
    .sortBy(lambda x: float(x[2]), False)

# frequency of the column-1 values, most frequent first (100 knocks, problem 19)
from operator import add
frequency = col1.map(lambda x: (x, 1)) \
    .reduceByKey(add) \
    .sortBy(lambda x: x[1], False)

[Dataflow diagram: each line ("... 41 2013-08-12", "... 40.9 2007-08-16", ...) is
split("\t") into an Array[String], filtered with len(x) > 3 to give tuples, mapped
with x[0] to col1, mapped to (x, 1) pairs, combined with reduceByKey(add), and
finally ordered with sortBy(x[1], False) to give the frequency RDD]

Displaying results:

import json
def pp(obj):
    print json.dumps(obj, ensure_ascii=False)

pp(sorted_by_col3.take(3))
# [["<prefecture>", "<city>", "41", "2013-08-12"],
#  ["<prefecture>", "<city>", "40.9", "2007-08-16"],
#  ["<prefecture>", "<city>", "40.9", "2007-08-16"]]
# the 3 hottest records

pp(col1.takeOrdered(3))
# ["<prefecture>", "<prefecture>", "<prefecture>"]
# the 3 smallest values of col1

pp(col1.top(3))
# ["<prefecture>", "<prefecture>", "<prefecture>"]
# the 3 largest values of col1

for data in frequency.take(10):
    print u'({}, {})'.format(*data)
# the 10 most frequent values of col1 with their counts:
# (<prefecture>, 3)
# (<prefecture>, 3)
# (<prefecture>, 3)
# (<prefecture>, 3)
# (<prefecture>, 2)
# (<prefecture>, 2)
# (<prefecture>, 2)
# (<prefecture>, 2)
# (<prefecture>, 1)
# (<prefecture>, 1)

Example: computing PMI (pointwise mutual information) for word pairs with Spark (Scala):

val src = sc.textFile("tuples.tsv")

val tuples = src.
  map(_.split("\t")).
  filter(_.size > 1)

val aFreq = tuples.
  map( t => (t(0), 1L) ).reduceByKey(_+_)

val bFreq = tuples.
  map( t => (t(1), 1L) ).reduceByKey(_+_)

val instancesFreq = tuples.
  map( t => ((t(0), t(1)), 1L) ).reduceByKey(_+_)

tuples.tsv contains tab-separated pairs such as:
combination\toffers
alabama\thome
wedding\tgreek
evil\tdead
:

[Dataflow diagram: each line of tuples.tsv is split("\t") into an Array[String] pair
such as [combination, offers], [alabama, home], [wedding, greek], [evil, dead];
map + reduceByKey then give the first-word counts aFreq, e.g. (combination, 20),
(alabama, 40), (wedding, 40), (evil, 16), the second-word counts bFreq, e.g.
(offers, 81), (home, 36), (greek, 24), (dead, 20), and the pair counts
instancesFreq, e.g. ((combination, offers), 1), ((alabama, home), 5),
((wedding, greek), 5), ((evil, dead), 3)]
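The deck stops at the three frequency RDDs; the remaining step joins them and applies the usual formula PMI(a, b) = log(N * c(a, b) / (c(a) * c(b))). A minimal PySpark sketch of that step (not from the slides, which use Scala; the variable names mirror the Scala ones):

# Rebuild the three frequency RDDs in Python, then join them into PMI scores.
from math import log
from operator import add

pairs = sc.textFile("tuples.tsv") \
    .map(lambda line: line.split("\t")) \
    .filter(lambda t: len(t) > 1)

a_freq = pairs.map(lambda t: (t[0], 1)).reduceByKey(add)                  # c(a)
b_freq = pairs.map(lambda t: (t[1], 1)).reduceByKey(add)                  # c(b)
instances_freq = pairs.map(lambda t: ((t[0], t[1]), 1)).reduceByKey(add)  # c(a, b)

n = pairs.count()   # total number of (a, b) instances

# Re-key the pair counts by a, join in c(a), re-key by b, join in c(b).
by_a = instances_freq.map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))          # (a, (b, c_ab))
with_a = by_a.join(a_freq)                                                   # (a, ((b, c_ab), c_a))
by_b = with_a.map(lambda kv: (kv[1][0][0], (kv[0], kv[1][0][1], kv[1][1])))  # (b, (a, c_ab, c_a))
with_b = by_b.join(b_freq)                                                   # (b, ((a, c_ab, c_a), c_b))

pmi = with_b.map(lambda kv: ((kv[1][0][0], kv[0]),
                             log(float(n) * kv[1][0][1] / (kv[1][0][2] * kv[1][1]))))

print pmi.take(10)   # [((a, b), pmi_score), ...]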