
Commit 1facde1

#415 Fix PR suggestions (Thanks @coderabbitai), add new unit test cases.
1 parent 6f7a75a commit 1facde1

3 files changed: +147 -18 lines changed

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/DefaultSource.scala

Lines changed: 26 additions & 11 deletions
@@ -65,6 +65,7 @@ class DefaultSource
 
   /** Writer relation */
   override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
+    val outSqlContext = sqlContext
     val path = parameters.getOrElse("path",
       throw new IllegalArgumentException("Path is required for this data source."))

@@ -73,27 +74,42 @@ class DefaultSource
 
     val readerParameters = CobolParametersParser.getReaderProperties(cobolParameters, None)
 
+    val outputPath = new Path(path)
+    val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
+    val fs = outputPath.getFileSystem(hadoopConf)
+
     mode match {
       case SaveMode.Overwrite =>
-        val outputPath = new Path(path)
-        val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
-        val fs = outputPath.getFileSystem(hadoopConf)
         if (fs.exists(outputPath)) {
           fs.delete(outputPath, true)
         }
       case SaveMode.Append =>
-        throw new IllegalArgumentException(
-          s"Save mode '$mode' is not supported by the 'spark-cobol' data source at the moment. " +
-            "Please use 'Overwrite' mode to write data to a file or folder."
-        )
+        if (fs.exists(outputPath)) {
+          throw new IllegalArgumentException(
+            s"Save mode '$mode' is not supported by the 'spark-cobol' data source at the moment. " +
+              "Please use 'Overwrite' mode to write data to a file or folder."
+          )
+        }
+      case SaveMode.ErrorIfExists =>
+        if (fs.exists(outputPath)) {
+          throw new IllegalArgumentException(
+            s"Path '$path' already exists; SaveMode.ErrorIfExists prevents overwriting."
+          )
+        }
+      case SaveMode.Ignore =>
+        if (fs.exists(outputPath)) {
+          // Skip the write entirely
+          return new BaseRelation {
+            override val sqlContext: SQLContext = outSqlContext
+            override def schema: StructType = data.schema
+          }
+        }
       case _ =>
     }
 
     val copybookContent = CopybookContentLoader.load(cobolParameters, sqlContext.sparkContext.hadoopConfiguration)
     val cobolSchema = CobolSchema.fromReaderParameters(copybookContent, readerParameters)
-
     val combiner = RecordCombinerSelector.selectCombiner(cobolSchema, readerParameters)
-
     val rdd = combiner.combine(data, cobolSchema, readerParameters)
 
     rdd.map(bytes => (NullWritable.get(), new BytesWritable(bytes)))
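
After this hunk the writer covers all four Spark save modes instead of rejecting everything except Overwrite. A minimal usage sketch of the resulting behavior; the session, output path, and copybook below are illustrative, not part of this commit:

import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().master("local[*]").appName("cobol-writer-demo").getOrCreate()
import spark.implicits._

// Matches a copybook with a 1-character field A and a 5-character field B.
val df = List(("A", "First"), ("B", "Scnd")).toDF("A", "B")

val copybook =
  """      01  RECORD.
    |        05  A  PIC X(1).
    |        05  B  PIC X(5).
    |""".stripMargin

// Overwrite: any existing output at the path is deleted, then the data is written.
df.write
  .format("cobol")
  .option("copybook_contents", copybook)
  .mode(SaveMode.Overwrite)
  .save("/tmp/cobol-out")

// Ignore: the path now exists, so this second write is silently skipped.
df.write
  .format("cobol")
  .option("copybook_contents", copybook)
  .mode(SaveMode.Ignore)
  .save("/tmp/cobol-out")

// With an existing path, SaveMode.ErrorIfExists and SaveMode.Append both throw
// IllegalArgumentException (append is simply not implemented by the writer yet).
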
@@ -105,8 +121,7 @@ class DefaultSource
     )
 
     new BaseRelation {
-      override def sqlContext: SQLContext = sqlContext
-
+      override def sqlContext: SQLContext = outSqlContext
       override def schema: StructType = data.schema
     }
   }
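
The `outSqlContext` capture fixes a name-shadowing bug: inside the anonymous `BaseRelation`, the name `sqlContext` resolves to the member being defined rather than to the method parameter, so the old `override def sqlContext: SQLContext = sqlContext` was a definition that calls itself. A self-contained sketch of the pattern, using `String` in place of `SQLContext` (`Rel`, `makeRelation`, and `outCtx` are hypothetical names):

trait Rel {
  def ctx: String
}

def makeRelation(ctx: String): Rel = {
  // Buggy shape: inside the anonymous class the member shadows the parameter,
  // so the body would only ever call itself (scalac rejects this exact shape
  // as a method that "does nothing other than call itself recursively").
  //   new Rel { override def ctx: String = ctx }

  val outCtx = ctx // capture the parameter under a name nothing shadows
  new Rel {
    override def ctx: String = outCtx // returns the captured parameter
  }
}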

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/BasicRecordCombiner.scala

Lines changed: 8 additions & 6 deletions
@@ -36,26 +36,28 @@ class BasicRecordCombiner extends RecordCombiner {
       }
     )
 
-    val sparkFieldPositions = cobolFields.map { cobolField =>
+    val sparkFieldPositions = cobolFields.zipWithIndex.map { case (cobolField, idx) =>
       val fieldName = cobolField.name.toLowerCase
       val position = sparkFields.indexOf(fieldName)
 
       if (position < 0) {
         throw new IllegalArgumentException(s"Field '${cobolField.name}' from the copybook is not found in the DataFrame schema.")
       }
 
-      position
+      (idx, position)
     }
 
     val size = cobolSchema.getRecordSize
 
     df.rdd.map { row =>
       val ar = new Array[Byte](size)
 
-      sparkFieldPositions.foreach { index =>
-        val fieldStr = row.get(index)
-        val cobolField = cobolFields(index)
-        cobolSchema.copybook.setPrimitiveField(cobolField, ar, fieldStr, 0)
+      sparkFieldPositions.foreach { case (cobolIdx, sparkIdx) =>
+        if (!row.isNullAt(sparkIdx)) {
+          val fieldStr = row.get(sparkIdx)
+          val cobolField = cobolFields(cobolIdx)
+          cobolSchema.copybook.setPrimitiveField(cobolField, ar, fieldStr, 0)
+        }
       }
 
       ar
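
This change decouples copybook field order from DataFrame column order: `sparkFieldPositions` now carries (copybook field index, DataFrame column index) pairs instead of a single position used to index both sides, and null cells are skipped so `setPrimitiveField` is never handed a null value. A self-contained sketch of just the index mapping (the field names are illustrative):

// Copybook fields in copybook order; DataFrame column names, lower-cased.
val cobolFields = Seq("A", "B", "C")
val sparkFields = Seq("c", "b", "a")

val sparkFieldPositions = cobolFields.zipWithIndex.map { case (field, idx) =>
  val position = sparkFields.indexOf(field.toLowerCase)
  if (position < 0) {
    throw new IllegalArgumentException(s"Field '$field' from the copybook is not found in the DataFrame schema.")
  }
  (idx, position) // (copybook index, DataFrame column index)
}

println(sparkFieldPositions) // List((0,2), (1,1), (2,0))

Before this commit the bare DataFrame position was also used to index `cobolFields`, which only lines up when the DataFrame columns happen to be in copybook order.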

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala

Lines changed: 113 additions & 1 deletion
@@ -23,6 +23,7 @@ import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
 import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture
 
 class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with BinaryFileFixture {
+
   import spark.implicits._
 
   private val copybookContents =
@@ -37,7 +38,7 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
         val df = List(("A", "First"), ("B", "Scnd"), ("C", "Last")).toDF("A", "B")
 
         val path = new Path(tempDir, "writer1")
-
+
         df.repartition(1)
          .orderBy("A")
          .write
@@ -74,6 +75,117 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
         }
       }
     }
+
+    "write data frames with different field order and null values" in {
+      withTempDirectory("cobol_writer1") { tempDir =>
+        val df = List((1, "First", "A"), (2, "Scnd", "B"), (3, null, "C")).toDF("C", "B", "A")
+
+        val path = new Path(tempDir, "writer1")
+
+        df.repartition(1)
+          .orderBy("A")
+          .write
+          .format("cobol")
+          .mode(SaveMode.Overwrite)
+          .option("copybook_contents", copybookContents)
+          .save(path.toString)
+
+        val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration)
+
+        assert(fs.exists(path), "Output directory should exist")
+        val files = fs.listStatus(path)
+          .filter(_.getPath.getName.startsWith("part-"))
+        assert(files.nonEmpty, "Output directory should contain part files")
+
+        val partFile = files.head.getPath
+        val data = fs.open(partFile)
+        val bytes = new Array[Byte](files.head.getLen.toInt)
+        data.readFully(bytes)
+        data.close()
+
+        // Expected EBCDIC data for sample test data
+        val expected = Array[Byte](
+          0xC1.toByte, 0xC6.toByte, 0x89.toByte, 0x99.toByte, 0xa2.toByte, 0xa3.toByte, // A,First
+          0xC2.toByte, 0xE2.toByte, 0x83.toByte, 0x95.toByte, 0x84.toByte, 0x00.toByte, // B,Scnd_
+          0xC3.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte  // C,null (B is null, so zero bytes)
+        )
+
+        if (!bytes.sameElements(expected)) {
+          println(s"Expected bytes: ${expected.map("%02X" format _).mkString(" ")}")
+          println(s"Actual bytes:   ${bytes.map("%02X" format _).mkString(" ")}")
+
+          assert(bytes.sameElements(expected), "Written data should match expected EBCDIC encoding")
+        }
+      }
+    }
+
+    "write should fail with save mode append and the path exists" in {
+      withTempDirectory("cobol_writer3") { tempDir =>
+        val df = List(("A", "First"), ("B", "Scnd"), ("C", "Last")).toDF("A", "B")
+
+        val path = new Path(tempDir, "writer2")
+
+        df.write
+          .format("cobol")
+          .mode(SaveMode.Append)
+          .option("copybook_contents", copybookContents)
+          .save(path.toString)
+
+        assertThrows[IllegalArgumentException] {
+          df.write
+            .format("cobol")
+            .mode(SaveMode.Append)
+            .option("copybook_contents", copybookContents)
+            .save(path.toString)
+        }
+      }
+    }
+
+    "write should fail with save mode fail if exists and the path exists" in {
+      withTempDirectory("cobol_writer3") { tempDir =>
+        val df = List(("A", "First"), ("B", "Scnd"), ("C", "Last")).toDF("A", "B")
+
+        val path = new Path(tempDir, "writer2")
+
+        df.write
+          .format("cobol")
+          .mode(SaveMode.ErrorIfExists)
+          .option("copybook_contents", copybookContents)
+          .save(path.toString)
+
+        assertThrows[IllegalArgumentException] {
+          df.write
+            .format("cobol")
+            .mode(SaveMode.ErrorIfExists)
+            .option("copybook_contents", copybookContents)
+            .save(path.toString)
+        }
+      }
+    }
+
+    "write should be ignored when save mode is ignore" in {
+      withTempDirectory("cobol_writer3") { tempDir =>
+        val df = List(("A", "First"), ("B", "Scnd"), ("C", "Last")).toDF("A", "B")
+
+        val path = new Path(tempDir, "writer2")
+
+        df.write
+          .format("cobol")
+          .mode(SaveMode.Ignore)
+          .option("copybook_contents", copybookContents)
+          .save(path.toString)
+
+        df.write
+          .format("cobol")
+          .mode(SaveMode.Ignore)
+          .option("copybook_contents", copybookContents)
+          .save(path.toString)
+
+        val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration)
+        assert(fs.exists(path), "Output directory should exist")
+      }
+    }
+
   }
 
 }
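
The expected byte arrays in these tests are EBCDIC with 0x00 used both for padding and for the null field; for letters, the byte values coincide across the common EBCDIC code pages (037, 500, 1047). Assuming a JDK whose extended charsets include `IBM037` (a.k.a. Cp037, shipped with standard JDK builds), the non-null portion can be cross-checked without Spark:

import java.nio.charset.Charset

val ebcdic = Charset.forName("IBM037")

// First record of the null-handling test: field A = "A", field B = "First".
val bytes = "AFirst".getBytes(ebcdic)
println(bytes.map("%02X".format(_)).mkString(" ")) // prints: C1 C6 89 99 A2 A3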
