
Commit 4375159

Merge branch 'master' into predictor-support-for-writing-gtiff
2 parents: 7997582 + 6165638

File tree: 7 files changed, +64 -12 lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion

@@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]

 ### Added
-- Add ZStd compression support for GTiff
+- Add ZStd compression support for GTiff [#3580](https://github.com/locationtech/geotrellis/pull/3580)
+- Do not depend on private Spark API, avoids sealing violation [#3586](https://github.com/locationtech/geotrellis/pull/3586)
 - Add predictor 2 (integer) and predictor 3 (float) support for writing compressed GTiff files [#3588](https://github.com/locationtech/geotrellis/pull/3588)

 ## [3.8.0] - 2025-04-23

raster/src/main/scala/geotrellis/raster/io/geotiff/compression/Compressor.scala

Lines changed: 2 additions & 3 deletions

@@ -28,12 +28,11 @@ trait Compressor extends Serializable {
     new Compressor {
       def wrapped: Compressor = Compressor.this

-      override def compress(bytes: Array[Byte], segmentIndex: Int): Array[Byte] = {
+      def compress(bytes: Array[Byte], segmentIndex: Int): Array[Byte] =
         wrapped.compress(predictor.encode(bytes, segmentIndex), segmentIndex = segmentIndex)
-      }

       /** Returns the decompressor that can decompress the segments compressed by this compressor */
-      override def createDecompressor(): Decompressor = wrapped.createDecompressor().withPredictorDecoding(predictor)
+      def createDecompressor(): Decompressor = wrapped.createDecompressor().withPredictorDecoding(predictor)
     }

 }
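
Note on the hunk above: the anonymous `new Compressor` supplies the first concrete implementations of the trait's abstract members, so the `override` modifiers were optional noise and dropping them does not change behaviour. Functionally, this wrapper runs `predictor.encode` over each raw segment before the underlying codec compresses it, and `createDecompressor` hands back a decompressor that applies the matching predictor decoding after decompression. A minimal, self-contained sketch of that decorator symmetry, using simplified stand-in traits (`SegmentCodec` and `SegmentPredictor` are illustrative names, not the GeoTrellis interfaces):

trait SegmentCodec {
  def compress(bytes: Array[Byte], segmentIndex: Int): Array[Byte]
  def decompress(bytes: Array[Byte], segmentIndex: Int): Array[Byte]
}

trait SegmentPredictor {
  def encode(bytes: Array[Byte], segmentIndex: Int): Array[Byte]
  def decode(bytes: Array[Byte], segmentIndex: Int): Array[Byte]
}

object PredictorWrapping {
  // Write path: predictor first, then codec; read path: codec first, then predictor.
  // Round trip holds whenever the codec and predictor are themselves lossless:
  //   wrap(c, p).decompress(wrap(c, p).compress(b, i), i) == b
  def wrap(codec: SegmentCodec, predictor: SegmentPredictor): SegmentCodec = new SegmentCodec {
    def compress(bytes: Array[Byte], segmentIndex: Int): Array[Byte] =
      codec.compress(predictor.encode(bytes, segmentIndex), segmentIndex)
    def decompress(bytes: Array[Byte], segmentIndex: Int): Array[Byte] =
      predictor.decode(codec.decompress(bytes, segmentIndex), segmentIndex)
  }
}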

raster/src/main/scala/geotrellis/raster/io/geotiff/compression/FloatingPointPredictor.scala

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ object FloatingPointPredictor {
       bytes
     }

-    override def encode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] = {
+    def encode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] = {
       val rows = rowsInSegment(segmentIndex)
       val bytesPerSample = bandType.bytesPerSample
       val bytesPerRow = colsPerRow * bandCount * bytesPerSample
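
For context, TIFF predictor 3 (the floating-point predictor) reorders the bytes of each row's samples into byte planes and then applies byte-wise horizontal differencing, so the slowly varying high-order bytes become long runs of small deltas that compress well. A rough single-row sketch of that idea, assuming 4-byte big-endian samples (illustrative only; the GeoTrellis code above works per segment and accounts for rows, band count and bytes per sample):

object FloatPredictorSketch {
  val bytesPerSample = 4

  def encodeRow(row: Array[Byte]): Array[Byte] = {
    val n = row.length / bytesPerSample
    val planes = new Array[Byte](row.length)
    // 1. Split samples into byte planes: all byte-0s first, then all byte-1s, ...
    for (i <- 0 until n; b <- 0 until bytesPerSample)
      planes(b * n + i) = row(i * bytesPerSample + b)
    // 2. Horizontal byte differencing over the rearranged row (back to front).
    for (j <- planes.length - 1 until 0 by -1)
      planes(j) = (planes(j) - planes(j - 1)).toByte
    planes
  }

  def decodeRow(encoded: Array[Byte]): Array[Byte] = {
    val planes = encoded.clone()
    // Undo the differencing (front to back) ...
    for (j <- 1 until planes.length)
      planes(j) = (planes(j) + planes(j - 1)).toByte
    // ... then re-interleave the byte planes back into samples.
    val n = planes.length / bytesPerSample
    val row = new Array[Byte](planes.length)
    for (i <- 0 until n; b <- 0 until bytesPerSample)
      row(i * bytesPerSample + b) = planes(b * n + i)
    row
  }
}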

raster/src/main/scala/geotrellis/raster/io/geotiff/compression/HorizontalPredictor.scala

Lines changed: 2 additions & 2 deletions

@@ -82,10 +82,10 @@ object HorizontalPredictor {
     val code: Int = Predictor.PREDICTOR_HORIZONTAL
     val checkEndian = true

-    override def encode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] =
+    def encode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] =
       encodeFunc(bytes, segmentIndex)

-    override def decode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] =
+    def decode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] =
       decodeFunc(bytes, segmentIndex)
   }
 }
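
TIFF predictor 2 (horizontal differencing) replaces each sample with its delta from the neighbouring sample to the left, turning smoothly varying rasters into streams of small values that Deflate or ZStd compress far better. A toy single-band, 8-bit sketch of the encode/decode pair (illustrative; the implementation above dispatches encodeFunc/decodeFunc over the actual bytes-per-sample and band count):

object HorizontalPredictorSketch {
  def encodeRow(row: Array[Byte]): Array[Byte] = {
    val out = row.clone()
    // Replace each sample with its delta from the previous one (right to left).
    for (i <- out.length - 1 until 0 by -1)
      out(i) = (out(i) - out(i - 1)).toByte
    out
  }

  def decodeRow(deltas: Array[Byte]): Array[Byte] = {
    val out = deltas.clone()
    // A running sum restores the original samples.
    for (i <- 1 until out.length)
      out(i) = (out(i) + out(i - 1)).toByte
    out
  }
}

// A smoothly varying row becomes small deltas:
//   encodeRow(Array[Byte](100, 101, 103, 106)) == Array[Byte](100, 1, 2, 3)
// and decodeRow inverts it exactly.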

raster/src/main/scala/geotrellis/raster/io/geotiff/compression/Predictor.scala

Lines changed: 2 additions & 2 deletions

@@ -45,8 +45,8 @@ object Predictor {
       val code: Int = PREDICTOR_NONE
       val checkEndian = true

-      override def encode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] = bytes
-      override def decode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] = bytes
+      def encode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] = bytes
+      def decode(bytes: Array[Byte], segmentIndex: Int): Array[Byte] = bytes
     }
     case Some(PREDICTOR_HORIZONTAL) =>
       HorizontalPredictor(tiffTags)
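
The `Predictor` object dispatches on the TIFF Predictor tag: 1 (or no tag) means no prediction, 2 means horizontal differencing, 3 means the floating-point predictor. A hedged sketch of that dispatch with illustrative names (the real code constructs the GeoTrellis predictor instances shown in the files above):

object PredictorSelectionSketch {
  // TIFF Predictor tag values (TIFF 6.0 spec plus the Adobe floating-point extension).
  val PredictorNone          = 1
  val PredictorHorizontal    = 2
  val PredictorFloatingPoint = 3

  sealed trait Kind
  case object NoPrediction  extends Kind
  case object Horizontal    extends Kind
  case object FloatingPoint extends Kind

  def predictorFor(tagValue: Option[Int]): Kind = tagValue match {
    case None | Some(PredictorNone)   => NoPrediction
    case Some(PredictorHorizontal)    => Horizontal
    case Some(PredictorFloatingPoint) => FloatingPoint
    case Some(other)                  => throw new IllegalArgumentException(s"Unsupported predictor code: $other")
  }
}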

spark/src/main/scala/geotrellis/spark/join/CartesianPartition.scala (new file)

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+package geotrellis.spark.join
+
+import org.apache.spark.Partition
+import org.apache.spark.internal.Logging
+import org.apache.spark.rdd.RDD
+
+import java.io.{IOException, ObjectOutputStream}
+import scala.util.control.NonFatal
+
+// https://github.com/apache/spark/blob/686d84453610e463df7df95395ce6ed36a6efacd/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala#L29
+private[join] class CartesianPartition(
+  idx: Int,
+  @transient private val rdd1: RDD[_],
+  @transient private val rdd2: RDD[_],
+  s1Index: Int,
+  s2Index: Int
+) extends Partition {
+
+  var s1 = rdd1.partitions(s1Index)
+  var s2 = rdd2.partitions(s2Index)
+  override val index: Int = idx
+
+  @throws(classOf[IOException])
+  private def writeObject(oos: ObjectOutputStream): Unit = CartesianPartition.tryOrIOException {
+    // Update the reference to parent split at the time of task serialization
+    s1 = rdd1.partitions(s1Index)
+    s2 = rdd2.partitions(s2Index)
+    oos.defaultWriteObject()
+  }
+}
+
+object CartesianPartition extends Logging {
+  /**
+   * Execute a block of code that returns a value, re-throwing any non-fatal uncaught
+   * exceptions as IOException. This is used when implementing Externalizable and Serializable's
+   * read and write methods, since Java's serializer will not report non-IOExceptions properly;
+   * see SPARK-4080 for more context.
+   */
+  // https://github.com/apache/spark/blob/686d84453610e463df7df95395ce6ed36a6efacd/common/utils/src/main/scala/org/apache/spark/util/SparkErrorUtils.scala#L35
+  private def tryOrIOException[T](block: => T): T = {
+    try {
+      block
+    } catch {
+      case e: IOException =>
+        logError("Exception encountered", e)
+        throw e
+      case NonFatal(e) =>
+        logError("Exception encountered", e)
+        throw new IOException(e)
+    }
+  }
+}
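
Spark keeps its own CartesianPartition private[spark], which is why GeoTrellis previously declared FilteredCartesianRDD inside the org.apache.spark.rdd package; that is the "sealing violation" the changelog entry refers to. Carrying a local copy under geotrellis.spark.join removes the dependency on Spark internals. A consumer RDD typically builds one such partition per pair of parent partitions, the same way Spark's CartesianRDD.getPartitions does; a sketch of that assembly (illustrative, not necessarily the exact FilteredCartesianRDD code):

package geotrellis.spark.join

import org.apache.spark.Partition
import org.apache.spark.rdd.RDD

object CartesianPartitionsSketch {
  // One partition per (left, right) pair, indexed row-major, mirroring
  // Spark's CartesianRDD.getPartitions.
  def cartesianPartitions(rdd1: RDD[_], rdd2: RDD[_]): Array[Partition] = {
    val numPartitionsInRdd2 = rdd2.partitions.length
    val array = new Array[Partition](rdd1.partitions.length * numPartitionsInRdd2)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }
}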

spark/src/main/scala/org/apache/spark/rdd/FilteredCartesianRDD.scala renamed to spark/src/main/scala/geotrellis/spark/join/FilteredCartesianRDD.scala

Lines changed: 3 additions & 3 deletions

@@ -19,13 +19,13 @@
  *
  * 1. https://github.com/apache/spark/blob/2f8776ccad532fbed17381ff97d302007918b8d8/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala
  */
-package org.apache.spark.rdd
+package geotrellis.spark.join

+import org.apache.spark._
+import org.apache.spark.rdd.RDD

 import scala.reflect.ClassTag

-import org.apache.spark._
-
 /** Performs a cartesian join of two RDDs using filter and refine pattern.
  *
  * During RDD declaration n*m partitions will be generated, one for each possible cartesian mapping.
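
The retained class doc describes the filter-and-refine pattern: a cheap test on per-element summaries prunes the n*m candidate pairs, and only the survivors pay for the exact, expensive comparison. A generic sketch of the pattern on plain collections (names and signature are illustrative; the actual FilteredCartesianRDD applies this per partition pair over RDDs):

object FilterRefineSketch {
  // filter: coarse test on cheap summaries; refine: exact test on the full values.
  def filterRefineJoin[A, B, SA, SB](left: Seq[A], right: Seq[B])
                                    (summarizeLeft: A => SA, summarizeRight: B => SB)
                                    (filter: (SA, SB) => Boolean)
                                    (refine: (A, B) => Boolean): Seq[(A, B)] =
    for {
      a <- left
      b <- right
      if filter(summarizeLeft(a), summarizeRight(b)) // prune most pairs cheaply
      if refine(a, b)                                // confirm on the full values
    } yield (a, b)
}

// For example, the summaries could be bounding boxes and `refine` an exact geometry
// intersection test, which is the typical spatial-join use of this pattern.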
