
Commit 01e8709

#415 Fix valuable nitpick PR suggestions from @coderabbitai.
1 parent 1facde1 commit 01e8709

8 files changed: +87 −19 lines changed

README.md

Lines changed: 9 additions & 0 deletions

@@ -1691,6 +1691,15 @@ The writer is still in its early stages and has several limitations:
 - Save mode `append` is not supported; only `overwrite` is.
 - Partitioning by DataFrame fields is not supported.
 
+### Implementation details
+Handling of `PIC X(n)`:
+- Values are truncated when longer than n and right-padded when shorter.
+- The padding byte is EBCDIC space `0x40`.
+- `null` values in DataFrames are written as `0x00` bytes.
+
+Handling of `FILLER`s:
+- FILLER record spaces are populated with `0x00` bytes.
+
 ## Performance Analysis
 
 Performance tests were performed on synthetic datasets. The setup and results are as follows.
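The padding and truncation rules documented above can be illustrated with a small standalone sketch. This is not Cobrix's actual encoder; the `Cp037` charset is an assumption standing in for whichever EBCDIC code page is configured.

```scala
// Illustrative sketch of the documented PIC X(n) rules; not the library's API.
def layoutPicX(value: String, n: Int): Array[Byte] = {
  if (value == null) {
    Array.fill[Byte](n)(0x00.toByte)                 // nulls are written as 0x00 bytes
  } else {
    val buf = Array.fill[Byte](n)(0x40.toByte)       // right-pad with EBCDIC space 0x40
    val bytes = value.getBytes("Cp037")              // assumed EBCDIC code page (IBM037)
    System.arraycopy(bytes, 0, buf, 0, math.min(bytes.length, n)) // truncate when longer than n
    buf
  }
}
```

For example, `layoutPicX("NEWN", 10)` yields the four encoded characters followed by six `0x40` bytes, while an 18-character value written into the same field keeps only its first ten encoded bytes.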

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/EncoderSelector.scala

Lines changed: 5 additions & 1 deletion

@@ -20,6 +20,7 @@ import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType}
 import za.co.absa.cobrix.cobol.parser.encoding.codepage.{CodePage, CodePageCommon}
 
 import java.nio.charset.{Charset, StandardCharsets}
+import java.util
 
 object EncoderSelector {
   type Encoder = Any => Array[Byte]
@@ -35,7 +36,7 @@ object EncoderSelector {
     }
   }
 
-  /** Gets a decoder function for a string data type. Encoder is chosen depending on whether input encoding is EBCDIC or ASCII */
+  /** Gets an encoder function for a string data type. The encoder is chosen depending on whether the output encoding is EBCDIC or ASCII. */
   private def getStringEncoder(encoding: Encoding,
                                ebcdicCodePage: CodePage,
                                asciiCharset: Charset,
@@ -68,6 +69,9 @@ object EncoderSelector {
       var i = 0
       val buf = new Array[Byte](length)
 
+      // PIC X fields are space-filled on mainframe. Use EBCDIC space 0x40.
+      util.Arrays.fill(buf, 0x40.toByte)
+
       while (i < string.length && i < length) {
         val asciiByte = string(i).toByte
         buf(i) = conversionTable((asciiByte + 256) % 256)
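Taken together, the hunk above amounts to the following encoding logic. This is a condensed sketch for readability; `conversionTable` is assumed to be the 256-entry ASCII-to-EBCDIC byte table that `EncoderSelector` builds from the code page.

```scala
import java.util

// Sketch: pre-fill the output with EBCDIC spaces, then convert characters
// through the code-page table, truncating at the field length.
def encodeEbcdicString(string: String, length: Int, conversionTable: Array[Byte]): Array[Byte] = {
  val buf = new Array[Byte](length)
  util.Arrays.fill(buf, 0x40.toByte)                    // unfilled tail stays as EBCDIC spaces
  var i = 0
  while (i < string.length && i < length) {
    val asciiByte = string(i).toByte
    buf(i) = conversionTable((asciiByte + 256) % 256)   // index safely despite signed bytes
    i += 1
  }
  buf
}
```

Before this change the unfilled tail stayed `0x00`, which is why the writer test expectations below switch from `0x00` to `0x40` for short values.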

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala

Lines changed: 14 additions & 2 deletions

@@ -247,12 +247,24 @@ object CobolParametersParser extends Logging {
     }
 
     val copybookPaths = params.get(PARAM_MULTI_COPYBOOK_PATH) match {
-      case Some(paths) => paths.split(',').toSeq
+      case Some(paths) =>
+        paths.split(',')
+          .iterator
+          .map(_.trim)
+          .filter(_.nonEmpty)
+          .toSeq
       case None => Seq.empty[String]
     }
 
+    val copybookPathOpt = getParameter(PARAM_COPYBOOK_PATH, params)
+    if (copybookPathOpt.nonEmpty && copybookPaths.nonEmpty) {
+      throw new IllegalArgumentException(
+        s"Options '$PARAM_COPYBOOK_PATH' (single path) and '$PARAM_MULTI_COPYBOOK_PATH' (comma-separated list) are mutually exclusive. Use only one."
+      )
+    }
+
     val cobolParameters = CobolParameters(
-      getParameter(PARAM_COPYBOOK_PATH, params),
+      copybookPathOpt,
      copybookPaths,
      getParameter(PARAM_COPYBOOK_CONTENTS, params),
      paths,
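The effect of the new list parsing can be shown with a tiny example (the paths are illustrative):

```scala
// Whitespace around entries and empty entries are now dropped.
val raw = " /copybooks/a.cpy , /copybooks/b.cpy ,, "
val copybookPaths = raw.split(',').iterator.map(_.trim).filter(_.nonEmpty).toSeq
// copybookPaths == Seq("/copybooks/a.cpy", "/copybooks/b.cpy")
```

Supplying both the single-path and the multi-path option now fails fast with an `IllegalArgumentException`.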

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/extract/BinaryExtractorSpec.scala

Lines changed: 29 additions & 0 deletions

@@ -22,6 +22,7 @@ import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType}
 import za.co.absa.cobrix.cobol.parser.ast.{BinaryProperties, Group, Primitive}
 import za.co.absa.cobrix.cobol.parser.decoders.DecoderSelector
 import za.co.absa.cobrix.cobol.parser.encoding.{EBCDIC, EncoderSelector}
+import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy
 
 class BinaryExtractorSpec extends AnyFunSuite {
 
@@ -217,4 +218,32 @@ class BinaryExtractorSpec extends AnyFunSuite {
     assert(fields2.isInstanceOf[Primitive])
     assert(fields2.asInstanceOf[Primitive].encode.isEmpty)
   }
+
+  test("Test padding when setting field value by name") {
+    val fieldName1: String = "COMPANY.SHORT-NAME"
+    val newValue1: String = "NEWN"
+    val copybook2 = CopybookParser.parseTree(copyBookContents, stringTrimmingPolicy = StringTrimmingPolicy.KeepAll)
+    copybook2.setFieldValueByName(fieldName1, bytes, newValue1, startOffset)
+    val result1: Any = copybook2.getFieldValueByName(fieldName1, bytes, startOffset)
+    assert(result1.asInstanceOf[String] === "NEWN      ")
+
+    val fieldName2: String = "COMPANY.COMPANY-ID-NUM"
+    val fields2 = copybook2.getFieldByName(fieldName2)
+    assert(fields2.isInstanceOf[Primitive])
+    assert(fields2.asInstanceOf[Primitive].encode.isEmpty)
+  }
+
+  test("Test truncating when setting field value by name") {
+    val fieldName1: String = "COMPANY.SHORT-NAME"
+    val newValue1: String = "NEWNAME_TEST123345"
+    val copybook2 = CopybookParser.parseTree(copyBookContents, stringTrimmingPolicy = StringTrimmingPolicy.KeepAll)
+    copybook2.setFieldValueByName(fieldName1, bytes, newValue1, startOffset)
+    val result1: Any = copybook2.getFieldValueByName(fieldName1, bytes, startOffset)
+    assert(result1.asInstanceOf[String] === "NEWNAME_TE")
+
+    val fieldName2: String = "COMPANY.COMPANY-ID-NUM"
+    val fields2 = copybook2.getFieldByName(fieldName2)
+    assert(fields2.isInstanceOf[Primitive])
+    assert(fields2.asInstanceOf[Primitive].encode.isEmpty)
+  }
 }

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/BasicRecordCombiner.scala

Lines changed: 12 additions & 6 deletions

@@ -18,16 +18,22 @@ package za.co.absa.cobrix.spark.cobol.writer
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
-import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive}
+import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive, Statement}
 import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
 import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
 
 class BasicRecordCombiner extends RecordCombiner {
   override def combine(df: DataFrame, cobolSchema: CobolSchema, readerParameters: ReaderParameters): RDD[Array[Byte]] = {
     val ast = getAst(cobolSchema)
-    validateSchema(df, ast)
+    val copybookFields = ast.children.filter {
+      case p: Primitive => !p.isFiller
+      case g: Group => !g.isFiller
+      case _ => true
+    }
+
+    validateSchema(df, copybookFields.toSeq)
 
-    val cobolFields = ast.children.map(_.asInstanceOf[Primitive])
+    val cobolFields = copybookFields.map(_.asInstanceOf[Primitive])
     val sparkFields = df.schema.fields.map(_.name.toLowerCase)
 
     cobolFields.foreach(cobolField =>
@@ -64,10 +70,10 @@ class BasicRecordCombiner extends RecordCombiner {
       }
     }
 
-  private def validateSchema(df: DataFrame, ast: Group): Unit = {
+  private def validateSchema(df: DataFrame, copybookFields: Seq[Statement]): Unit = {
     val dfFields = df.schema.fields.map(_.name.toLowerCase).toSet
 
-    val notFoundFields = ast.children.flatMap { field =>
+    val notFoundFields = copybookFields.flatMap { field =>
       if (dfFields.contains(field.name.toLowerCase)) {
         None
       } else {
@@ -79,7 +85,7 @@ class BasicRecordCombiner extends RecordCombiner {
       throw new IllegalArgumentException(s"The following fields from the copybook are not found in the DataFrame: ${notFoundFields.mkString(", ")}")
     }
 
-    val unsupportedDataTypeFields = ast.children.filter { field =>
+    val unsupportedDataTypeFields = copybookFields.filter { field =>
       field.isInstanceOf[Group] ||
       (field.isInstanceOf[Primitive] && field.asInstanceOf[Primitive].occurs.isDefined) ||
       field.redefines.nonEmpty

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/RawBinaryOutputFormat.scala

Lines changed: 2 additions & 1 deletion

@@ -40,7 +40,8 @@ import java.io.DataOutputStream
 
 class RawBinaryOutputFormat extends FileOutputFormat[NullWritable, BytesWritable] {
   override def getRecordWriter(context: TaskAttemptContext): RecordWriter[NullWritable, BytesWritable] = {
-    val path: Path = getDefaultWorkFile(context, ".dat")
+    val extension = context.getConfiguration.get("cobol.writer.output.extension", ".dat")
+    val path: Path = getDefaultWorkFile(context, extension)
     val fs = path.getFileSystem(context.getConfiguration)
     val out: DataOutputStream = fs.create(path, false)
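Since the extension is read from the Hadoop configuration, one way to change it from a Spark job would look like the sketch below. This is hypothetical usage; the diff only confirms the `cobol.writer.output.extension` key and its `.dat` default.

```scala
// Assumed usage: set the key on the Hadoop configuration before writing.
spark.sparkContext.hadoopConfiguration.set("cobol.writer.output.extension", ".bin")

df.write
  .format("cobol")
  .option("copybook_contents", copybookContents)  // copybookContents assumed defined elsewhere
  .mode("overwrite")
  .save("/output/path")
// Part files should then end in ".bin" instead of the default ".dat".
```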
4647

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/RecordCombinerSelector.scala

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@ object RecordCombinerSelector {
    * Currently, only basic fixed record length combiner is implemented.
    * This method is to be extended as writing capabilities evolve.
    *
-   * @param cobolSchema The COBOL schema ot output record.
+   * @param cobolSchema The COBOL schema of the output record.
    * @param readerParameters Configuration parameters that specify how records should be formed.
    * @return A `RecordCombiner` implementation suitable for combining records based on the given schema and parameters.
    */

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala

Lines changed: 15 additions & 8 deletions

@@ -39,7 +39,7 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
 
       val path = new Path(tempDir, "writer1")
 
-      df.repartition(1)
+      df.coalesce(1)
        .orderBy("A")
        .write
        .format("cobol")
@@ -63,8 +63,8 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
      // Expected EBCDIC data for sample test data
      val expected = Array[Byte](
        0xC1.toByte, 0xC6.toByte, 0x89.toByte, 0x99.toByte, 0xa2.toByte, 0xa3.toByte, // A,First
-        0xC2.toByte, 0xE2.toByte, 0x83.toByte, 0x95.toByte, 0x84.toByte, 0x00.toByte, // B,Scnd_
-        0xC3.toByte, 0xD3.toByte, 0x81.toByte, 0xa2.toByte, 0xa3.toByte, 0x00.toByte  // C,Last_
+        0xC2.toByte, 0xE2.toByte, 0x83.toByte, 0x95.toByte, 0x84.toByte, 0x40.toByte, // B,Scnd_
+        0xC3.toByte, 0xD3.toByte, 0x81.toByte, 0xa2.toByte, 0xa3.toByte, 0x40.toByte  // C,Last_
      )
 
      if (!bytes.sameElements(expected)) {
@@ -82,12 +82,19 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
 
      val path = new Path(tempDir, "writer1")
 
-      df.repartition(1)
+      val copybookContentsWithFilers =
+        """      01 RECORD.
+                    05 A PIC X(1).
+                    05 FILLER PIC X(1).
+                    05 B PIC X(5).
+        """
+
+      df.coalesce(1)
        .orderBy("A")
        .write
        .format("cobol")
        .mode(SaveMode.Overwrite)
-        .option("copybook_contents", copybookContents)
+        .option("copybook_contents", copybookContentsWithFilers)
        .save(path.toString)
 
      val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration)
@@ -105,9 +112,9 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
 
      // Expected EBCDIC data for sample test data
      val expected = Array[Byte](
-        0xC1.toByte, 0xC6.toByte, 0x89.toByte, 0x99.toByte, 0xa2.toByte, 0xa3.toByte, // A,First
-        0xC2.toByte, 0xE2.toByte, 0x83.toByte, 0x95.toByte, 0x84.toByte, 0x00.toByte, // B,Scnd_
-        0xC3.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte  // C,Last_
+        0xC1.toByte, 0x00.toByte, 0xC6.toByte, 0x89.toByte, 0x99.toByte, 0xa2.toByte, 0xa3.toByte, // A,First
+        0xC2.toByte, 0x00.toByte, 0xE2.toByte, 0x83.toByte, 0x95.toByte, 0x84.toByte, 0x40.toByte, // B,Scnd_
+        0xC3.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte  // C,Last_
      )
 
      if (!bytes.sameElements(expected)) {
