Spark OOM Analysis
If the job is submitted in yarn-client mode, the driver's JVM parameters are read by Spark from the spark-env.sh configuration file, so add the following setting to spark-env.sh: SPARK_DRIVER_MEMORY=2G
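Equivalently, the driver heap can be set per submission with spark-submit's --driver-memory option; note that in client mode spark.driver.memory cannot be raised from inside the application through SparkConf, because the driver JVM has already started by the time user code runs.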
When the job runs on the workers it fails with: Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
In that case, configure the memory available to each worker by adding the following to spark-env.sh:
export SPARK_WORKER_MEMORY=4g
export SPARK_WORKER_CORES=4
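SPARK_WORKER_MEMORY only raises the total memory a standalone worker may hand out to executors; the heap a given application's executors actually request is set per application (spark.executor.memory, 1g by default). A minimal sketch of doing this from application code, assuming a self-contained program rather than the spark-shell session below (the app name and sizes are placeholder values):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("oom-demo")                    // placeholder application name
  .config("spark.executor.memory", "4g")  // heap requested for each executor
  .getOrCreate()
// spark.driver.memory cannot be set here in client mode: the driver JVM is
// already running, so use spark-env.sh or --driver-memory instead.

For an interactive session, the same effect comes from launching it as spark-shell --executor-memory 4g.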
scala> val ds1 = spark.range(1, 100000)
ds1: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val ds2 = spark.range(1, 100000, 2)
ds2: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val ds3 = ds1.repartition(7)
ds3: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val ds4 = ds2.repartition(12)
ds4: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val ds5 = ds3.selectExpr(" id * 5 as id")
ds5: org.apache.spark.sql.DataFrame = [id: bigint]
scala> val joined = ds5.join(ds4, "id")
joined: org.apache.spark.sql.DataFrame = [id: bigint]
scala> val sum = joined.selectExpr(" sum(id) as result")
sum: org.apache.spark.sql.DataFrame = [result: bigint]
scala> sum.show()
+---------+
|   result|
+---------+
|500000000|
+---------+
The result is 500,000,000: the join keeps only the ids that are both odd (ds4) and multiples of 5 (ds5), i.e. the 10,000 odd multiples of 5 between 5 and 99,995, whose sum is 10,000 × 50,000 = 500,000,000. If the ranges above are enlarged, the same pipeline runs out of memory:
scala> val ds1 = spark.range(1, 10000000)
ds1: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val ds2 = spark.range(1, 10000000, 2)
ds2: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val ds3 = ds1.repartition(7)
ds3: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val ds4 = ds2.repartition(12)
ds4: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val ds5 = ds3.selectExpr(" id * 5 as id")
ds5: org.apache.spark.sql.DataFrame = [id: bigint]
scala> val joined = ds5.join(ds4, "id")
joined: org.apache.spark.sql.DataFrame = [id: bigint]
scala> val sum = joined.selectExpr(" sum(id) as result")
sum: org.apache.spark.sql.DataFrame = [result: bigint]
scala> sum.show()
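With the larger ranges, the shuffle behind the join and the final aggregation can exceed the heap configured above and trigger the heap-space error. Before simply raising memory again, it is worth inspecting the query in the same spark-shell session; a minimal sketch using the joined and spark values defined above (the partition count of 400 is an arbitrary example):

// Show the physical plan: which join strategy is chosen and where the
// shuffle exchanges sit.
joined.explain()

// Number of partitions produced by the shuffle; the default comes from
// spark.sql.shuffle.partitions (200 unless overridden).
println(joined.rdd.getNumPartitions)

// More shuffle partitions mean a smaller per-task slice of the joined data,
// which can keep each task within the executor heap.
spark.conf.set("spark.sql.shuffle.partitions", "400")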