When a job is submitted in yarn-client mode, the driver's JVM options at runtime are read by Spark from the spark-env.sh configuration file, so add the following setting to spark-env.sh: SPARK_DRIVER_MEMORY=2G
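
To confirm the driver JVM actually picked up the larger heap, a quick check from spark-shell is to ask the JVM for its maximum heap size (a rough check only: maxMemory typically reports somewhat below the configured -Xmx):

// Approximate driver heap in MB; with SPARK_DRIVER_MEMORY=2G this should come out a little under 2048
Runtime.getRuntime.maxMemory / 1024 / 1024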

The job fails while running on the workers with: Exception in thread "main" java.lang.OutOfMemoryError: Java heap space

In that case, configure the memory each worker can use by adding the following to spark-env.sh:

export SPARK_WORKER_MEMORY=4g

export SPARK_WORKER_CORES=4
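
Note that SPARK_WORKER_MEMORY only caps the total memory a standalone worker may hand out to its executors; the heap of each executor JVM is set per application through spark.executor.memory. A minimal sketch of setting it when the session is built (the application name and the values here are illustrative, not taken from the original job):

import org.apache.spark.sql.SparkSession

// Illustrative values only: executor heap and cores are per-application settings,
// bounded above by SPARK_WORKER_MEMORY / SPARK_WORKER_CORES on each worker
val spark = SparkSession.builder()
  .appName("memory-demo")                  // hypothetical application name
  .config("spark.executor.memory", "2g")   // heap of each executor JVM
  .config("spark.executor.cores", "2")     // cores per executor
  .getOrCreate()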

scala> val ds1 = spark.range(1, 100000)
ds1: org.apache.spark.sql.Dataset[Long] = [id: bigint]

scala> val ds2 = spark.range(1, 100000, 2)
ds2: org.apache.spark.sql.Dataset[Long] = [id: bigint]

scala> val ds3 = ds1.repartition(7) 
ds3: org.apache.spark.sql.Dataset[Long] = [id: bigint]

scala> val ds4 = ds2.repartition(12)
ds4: org.apache.spark.sql.Dataset[Long] = [id: bigint]

scala> val ds5 = ds3.selectExpr(" id * 5 as id") 
ds5: org.apache.spark.sql.DataFrame = [id: bigint]

scala> val joined =  ds5.join(ds4, "id") 
joined: org.apache.spark.sql.DataFrame = [id: bigint]

scala> val sum = joined.selectExpr(" sum(id)  as result")
sum: org.apache.spark.sql.DataFrame = [result: bigint]

scala> sum.show()
+---------+
|   result|
+---------+
|500000000|
+---------+
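
The figure can be sanity-checked in plain Scala: the join keeps the ids that ds5 and ds4 have in common, i.e. the odd multiples of 5 below 100000, and their sum is indeed 500000000:

// Odd multiples of 5 below 100000: 5, 15, ..., 99995 (10000 values)
(1L until 100000L by 2).filter(_ % 5 == 0).sum   // 500000000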

If the data above is scaled up, the same job produces an OOM:



scala> val ds1 = spark.range(1, 10000000)
ds1: org.apache.spark.sql.Dataset[Long] = [id: bigint]

scala> val ds2 = spark.range(1, 10000000, 2)
ds2: org.apache.spark.sql.Dataset[Long] = [id: bigint]

scala> val ds3 = ds1.repartition(7) 
ds3: org.apache.spark.sql.Dataset[Long] = [id: bigint]

scala> val ds4 = ds2.repartition(12)
ds4: org.apache.spark.sql.Dataset[Long] = [id: bigint]

scala> val ds5 = ds3.selectExpr(" id * 5 as id") 
ds5: org.apache.spark.sql.DataFrame = [id: bigint]

scala> val joined =  ds5.join(ds4, "id") 
joined: org.apache.spark.sql.DataFrame = [id: bigint]

scala> val sum = joined.selectExpr(" sum(id)  as result")
sum: org.apache.spark.sql.DataFrame = [result: bigint]

scala> sum.show()
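
For reference, the expected result at this scale can be computed locally in plain Scala (again, the join keeps the odd multiples of 5, now below 10000000), which suggests the failure is a matter of Spark memory configuration rather than of the computation itself:

// Odd multiples of 5 below 10000000: 5, 15, ..., 9999995 (1,000,000 values)
(1L until 10000000L by 2).filter(_ % 5 == 0).sum   // 5000000000000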