Apache Spark


  1. 1. Apache Spark. Moneytap/I-Free, 2015. http://spark.apache.org/ http://i-free.com/ http://moneytapp.com/ https://vk.com/sniff303
  2. 2. What is it? A framework for distributed data processing: Map-Reduce style batch jobs, in-memory computation (data held in RAM), and Stream Processing.
  3. 3. Why? To run our own analytics over raw event data, for questions that Google Analytics (and similar tools) do not answer.
  4. 4. Spark vs. Hadoop Map-Reduce: an API in Scala, Python, and Java; intermediate results can stay in memory instead of being written to disk between stages; Spark Streaming as an alternative to Apache Storm.
  5. 5. How is a cluster organized? Driver Program: the process that runs the application's main() and creates the SparkContext. Cluster Manager: allocates cluster resources across applications. Worker Node: a node of the cluster that can run application code. Executor: a process started on a worker node that executes tasks and keeps data in memory or on disk.
  6. 6. Deployment options: Local, Standalone (+ ZooKeeper), Hadoop YARN, Apache Mesos (see the master URL sketch below).
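Each option maps to a master URL that the driver passes to Spark, either in SparkConf or via the --master flag of spark-submit. A minimal sketch, not from the slides; host names and ports are placeholders:

      // Local mode: run everything in one JVM with 4 worker threads.
      SparkConf local = new SparkConf().setAppName("example").setMaster("local[4]");

      // Standalone cluster: point the driver at the standalone master.
      SparkConf standalone = new SparkConf().setAppName("example").setMaster("spark://master-host:7077");

      // YARN and Mesos: the master is usually given to spark-submit instead,
      // e.g. --master yarn-cluster or --master mesos://mesos-host:5050 (Spark 1.x syntax).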
  7. 7. ?
  8. 8. The Map-Reduce model (illustration: http://xiaochongzhang.me/blog/?p=338).
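To make the model concrete, here is the classic word count written against Spark's Java API. This snippet is not from the slides; the input path is a placeholder and sc is assumed to be an existing JavaSparkContext:

      Map<String, Long> counts = sc.textFile("/path/to/input.txt")
          // map phase: split each line into words
          .flatMap(line -> Arrays.asList(line.split("\\s+")))
          // emit (word, 1) pairs
          .mapToPair(word -> new Tuple2<>(word, 1L))
          // reduce phase: sum the counts per word
          .reduceByKey((a, b) -> a + b)
          .collectAsMap();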
  9. 9. Resilient Distributed Dataset (RDD): an immutable, partitioned collection of records processed in parallel. An RDD is created from external data or by transforming another RDD, and lost partitions can be recomputed from that lineage: RDD -> transform -> RDD -> transform -> RDD.
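A small sketch of the idea (not from the slides): transformations only describe new RDDs, the source RDD is never modified, and nothing runs until an action is called.

      JavaRDD<Integer> numbers = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5));

      // Transformations return new RDDs; 'numbers' itself stays unchanged.
      JavaRDD<Integer> even = numbers.filter(n -> n % 2 == 0);
      JavaRDD<Integer> doubled = even.map(n -> n * 2);

      // Only the action triggers computation of the whole lineage.
      long result = doubled.count();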
  10. 10. Where RDD data comes from. Storage: local file system, HDFS, Cassandra, HBase, Amazon S3. Formats: plain text, GZIPped plain text, Hadoop InputFormats.
        JavaRDD<String> localRDD = sparkContext.textFile("/path/data_2015_05_*.txt.gz");
        JavaRDD<String> hdfsRDD = sparkContext.textFile("hdfs://...");
  11. 11. Working with an RDD.
        Transformations: build a new RDD from an existing one (lazy).
        - .map(function) - .flatMap(function) - .filter(function) - .sample(n) - .union(anotherRDD) - .intersection(anotherRDD) - .distinct() - .groupByKey() - .reduceByKey() - .join(anotherRDD)
        Actions: return a result to the driver and trigger computation.
        - .reduce(function) - .collect() - .count() - .take(n) - .takeOrdered(n, comparator) - .foreach(function)
        Persistence: control caching of an RDD.
        - .saveAs...() - .persist(memoryLevel) - .unpersist()
        Map sdkVersions = sparkContext.textFile(filePath)
            .filter(s -> s.contains("AD_GET"))
            .map(s -> Extractors.extractSdkVersion(s))
            .mapToPair(t -> new Tuple2(t._2(), 1L))
            .reduceByKey((left, right) -> left + right)
            .collectAsMap();
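Persistence matters as soon as the same RDD feeds more than one action; without it the whole lineage is recomputed for every action. A minimal sketch, not from the slides, with illustrative names:

      JavaRDD<String> adGets = sparkContext.textFile(filePath)
          .filter(s -> s.contains("AD_GET"))
          .persist(StorageLevel.MEMORY_ONLY());   // cache after the expensive filter

      long total = adGets.count();                // first action: computes and caches
      List<String> sample = adGets.take(10);      // second action: served from the cache

      adGets.unpersist();                         // release the cached partitions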
  12. 12. Broadcast variables: read-only values shipped once to every executor, distributed between nodes peer-to-peer. Accumulators: variables that tasks can only add to; the accumulated value is read back on the driver.
        List largeList = ...;
        Broadcast<List> broadcastVar = sparkContext.broadcast(largeList);
        Accumulator<Integer> accum = sparkContext.accumulator(0);

        public class MapAccumulator implements AccumulatorParam<Map<Long, Long>> {
            @Override
            public Map<Long, Long> addAccumulator(Map<Long, Long> m1, Map<Long, Long> m2) {
                for (Map.Entry<Long, Long> m2entry : m2.entrySet()) {
                    Long m1val = m1.get(m2entry.getKey());
                    if (m1val == null) {
                        m1val = m2entry.getValue();
                    } else {
                        m1val += m2entry.getValue();
                    }
                    m1.put(m2entry.getKey(), m1val);
                }
                return m1;
            }
            // ...
        }
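The slide shows how broadcast variables and accumulators are created; here is a short sketch of how they are typically used inside a job. Not from the slides: lines is assumed to be an existing JavaRDD<String>, and broadcastVar/accum are the variables declared above:

      // Executors read the broadcast value instead of capturing largeList in the closure.
      JavaRDD<String> matching = lines.filter(line -> broadcastVar.value().contains(line));

      // Executors may only add to the accumulator; the driver reads it after the action.
      matching.foreach(line -> accum.add(1));
      System.out.println("matched lines: " + accum.value());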
  13. 13. An example!
  14. 14. public class SparkExample {
          public static void main(String... args) {
              if (args.length < 3) {
                  throw new IllegalArgumentException();
              }
              String date = args[0];
              String appId = args[1];
              String network = args[2];
              final String filePath = String.format("/var/stat/%s/mt/%s/%s/*.ldjson.gz", date, appId, network);

              SparkConf sparkConfiguration = new SparkConf()
                  .setAppName("SparkExample-" + date + "-" + appId + "-" + network);
              JavaSparkContext sparkContext = new JavaSparkContext(sparkConfiguration);

              JavaRDD<Tuple3<String, String, Long>> dataForApp = sparkContext.textFile(filePath)
                  .filter(StringFilter.containJsonKeyValue("statisticEventType", "AD_GET"))
                  .map(line -> {
                      JsonExtractor extract = JsonExtractor.from(line);
                      return new Tuple3<>(
                          extract.visitorId(),
                          extract.device(),
                          extract.timestamp()
                      );
                  })
                  .setName("SparkExampleRDD")
                  .persist(StorageLevel.MEMORY_ONLY_SER());

              Map<String, Long> topDevices = dataForApp
                  .mapToPair(t -> new Tuple2<>(t._2(), 1L))
                  .reduceByKey((left, right) -> left + right)
                  .top(50, DeviceTupleComparator.instance())
                  .stream()
                  .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
  15. 15.     JavaRDD<Tuple2<Long, Long>> usersToSessions = dataForApp
                  .mapToPair(t -> new Tuple2<>(t._1(), t._3()))
                  .groupByKey()
                  .flatMap(t -> {
                      Iterator<Long> timestamps = t._2().iterator();
                      SessionCalculator sessions = SessionCalculator.from(timestamps);
                      if (sessions.isAny()) {
                          return Collections.singletonList(
                              new Tuple2<>(sessions.getCount(), sessions.getApproximateLength()));
                      } else {
                          return Collections.emptyList();
                      }
                  });

              Accumulator<Double> activeUsersAccumulator = sparkContext.accumulator(0.0D);
              Accumulator<Map<Long, Long>> sessionLengthAccumulator =
                  sparkContext.accumulator(new HashMap<>(), MapAccumulator.get());
              Accumulator<Map<Long, Long>> sessionCountAccumulator =
                  sparkContext.accumulator(new HashMap<>(), MapAccumulator.get());

              usersToSessions.foreach(t -> {
                  activeUsersAccumulator.add(1.0D);  // active users
                  Long count = t._1();               // session count for the user
                  Map<Long, Long> map2 = new HashMap<>();
                  map2.put(count, 1L);
                  sessionCountAccumulator.add(map2);
                  Long minute = t._2();              // session length for the user
                  Map<Long, Long> map = new HashMap<>();
                  map.put(minute, 1L);
                  sessionLengthAccumulator.add(map);
              });
  16. 16.     Map<Long, Long> sessionLengthDistribution = sessionLengthAccumulator.value();
              Map<Long, Long> sessionCountDistribution = sessionCountAccumulator.value();
              Long activeUsers = activeUsersAccumulator.value().longValue();

              System.out.printf("topDevices: %s", topDevices);
              System.out.printf("sessionLengthDistribution: %s", sessionLengthDistribution);
              System.out.printf("sessionCountDistribution: %s", sessionCountDistribution);
              System.out.printf("activeUsers: %s", activeUsers);

              dataForApp.unpersist(true);
              sparkContext.stop();
          }
      }
  17. 17. Setting up a Spark standalone cluster: 1. Unpack the Spark distribution on every node. 2. Edit conf/spark-env.sh. 3. Edit conf/spark-defaults.conf. 4. Start the master and the workers.
        conf/spark-env.sh:        SPARK_MASTER_IP=...  SPARK_WORKER_MEMORY=...
        conf/spark-defaults.conf: spark.master=spark://...  spark.executor.memory=...
        $ ./sbin/start-master.sh
        $ ./bin/spark-class org.apache.spark.deploy.worker.Worker spark://...
  18. 18. Running a job: 1. Build a fat-jar with the application and its dependencies (Spark itself is already provided on the cluster). 2. Submit it with spark-submit:
        $ ./bin/spark-submit --class com.ifree.SparkExample spark-example.jar 2015-05-26 c87ad063-c38f-4d2d-bbfe-d7ddfec5aab0 moneytapp
  19. 19.
  20. 20. Notes from production use: the master and workers are kept alive with Monit; jobs are written in Java; serialization is switched to Kryo (see the configuration sketch below); Enum values need special care in Spark jobs; there is also SparkSQL; input is picked up via wildcard paths (*) and jobs are scheduled with CRON, which invokes spark-submit. spark.serializer=org.apache.spark.serializer.KryoSerializer
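The Kryo setting can go into conf/spark-defaults.conf or be applied programmatically. A minimal sketch, not from the slides; the registered classes are only illustrative:

      SparkConf conf = new SparkConf()
          .setAppName("SparkExample")
          // replace default Java serialization with Kryo
          .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
          // optional: registering classes up front keeps the Kryo output compact
          .registerKryoClasses(new Class<?>[]{Tuple2.class, Tuple3.class});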
  21. 21. Thank you! Links: http://spark.apache.org/ http://lambda-architecture.net/ https://www.edx.org/course/introduction-big-data-apache-spark-uc-berkeleyx-cs100-1x