
Commit 061c928

#776 Implement BCD encoders in the COBOL data writer.

1 parent 6b6e276 commit 061c928

File tree: 5 files changed, +152 −40 lines

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/Copybook.scala

Lines changed: 33 additions & 32 deletions
@@ -26,6 +26,7 @@ import scala.collection.mutable.ArrayBuffer
 
 
 class Copybook(val ast: CopybookAST) extends Logging with Serializable {
+  import Copybook._
 
   def getCobolSchema: CopybookAST = ast

@@ -215,38 +216,6 @@ class Copybook(val ast: CopybookAST) extends Logging with Serializable {
     field.decodeTypeValue(0, slicedBytes)
   }
 
-  /**
-    * Set value of a field of the copybook record by the AST object of the field
-    *
-    * Nested field names can contain '.' to identify the exact field.
-    * If the field name is unique '.' is not required.
-    *
-    * @param field       The AST object of the field
-    * @param bytes       Binary encoded data of the record
-    * @param startOffset An offset to the beginning of the field in the data (in bytes).
-    * @return The value of the field
-    *
-    */
-  def setPrimitiveField(field: Primitive, recordBytes: Array[Byte], value: Any, startOffset: Int = 0): Unit = {
-    field.encode match {
-      case Some(encode) =>
-        val fieldBytes = encode(value)
-        val startByte = field.binaryProperties.offset + startOffset
-        val endByte = field.binaryProperties.offset + startOffset + field.binaryProperties.actualSize
-
-        if (startByte < 0 || endByte > recordBytes.length) {
-          throw new IllegalArgumentException(s"Cannot set value for field '${field.name}' because the field is out of bounds of the record.")
-        }
-        if (fieldBytes.length != field.binaryProperties.dataSize) {
-          throw new IllegalArgumentException(s"Cannot set value for field '${field.name}' because the encoded value has a different size than the field size.")
-        }
-
-        System.arraycopy(fieldBytes, 0, recordBytes, startByte, fieldBytes.length)
-      case None =>
-        throw new IllegalStateException(s"Cannot set value for field '${field.name}' because it does not have an encoder defined.")
-    }
-  }
-
   /** This routine is used for testing by generating a layout position information to compare with mainframe output */
   def generateRecordLayoutPositions(): String = {
     var fieldCounter: Int = 0
@@ -442,4 +411,36 @@ object Copybook {
 
     new Copybook(schema)
   }
+
+  /**
+    * Set value of a field of the copybook record by the AST object of the field.
+    *
+    * Nested field names can contain '.' to identify the exact field.
+    * If the field name is unique, '.' is not required.
+    *
+    * @param field       The AST object of the field
+    * @param recordBytes Binary encoded data of the record
+    * @param value       The value to encode into the field
+    * @param startOffset An offset to the beginning of the field in the data (in bytes).
+    *
+    */
+  def setPrimitiveField(field: Primitive, recordBytes: Array[Byte], value: Any, startOffset: Int = 0): Unit = {
+    field.encode match {
+      case Some(encode) =>
+        val fieldBytes = encode(value)
+        val startByte = field.binaryProperties.offset + startOffset
+        val endByte = field.binaryProperties.offset + startOffset + field.binaryProperties.actualSize
+
+        if (startByte < 0 || endByte > recordBytes.length) {
+          throw new IllegalArgumentException(s"Cannot set value for field '${field.name}' because the field is out of bounds of the record.")
+        }
+        if (fieldBytes.length != field.binaryProperties.dataSize) {
+          throw new IllegalArgumentException(s"Cannot set value for field '${field.name}' because the encoded value has a different size than the field size.")
+        }
+
+        System.arraycopy(fieldBytes, 0, recordBytes, startByte, fieldBytes.length)
+      case None =>
+        throw new IllegalStateException(s"Cannot set value for field '${field.name}' because it does not have an encoder defined.")
+    }
+  }
 }
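
The setter's behaviour is unchanged; it only moves from the Copybook class to its companion object, so callers no longer need a Copybook instance to mutate a record buffer. A minimal usage sketch (the field name is taken from BinaryExtractorSpec; the copybook and record values are assumed to exist in the caller):

    import za.co.absa.cobrix.cobol.parser.Copybook
    import za.co.absa.cobrix.cobol.parser.ast.Primitive

    // 'copybook' is a parsed Copybook; 'record' is a byte array holding one fixed-length record.
    val field: Primitive = copybook.getFieldByName("COMPANY.COMPANY-ID-NUM").asInstanceOf[Primitive]

    // The call is now static: only the field's AST node and the record buffer are needed.
    Copybook.setPrimitiveField(field, record, 12345, startOffset = 0)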

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/EncoderSelector.scala

Lines changed: 31 additions & 1 deletion
@@ -16,7 +16,7 @@
 
 package za.co.absa.cobrix.cobol.parser.encoding
 
-import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType}
+import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, COMP3, COMP3U, CobolType, Decimal, Integral}
 import za.co.absa.cobrix.cobol.parser.encoding.codepage.{CodePage, CodePageCommon}
 
 import java.nio.charset.{Charset, StandardCharsets}
@@ -31,6 +31,14 @@ object EncoderSelector {
     dataType match {
       case alphaNumeric: AlphaNumeric if alphaNumeric.compact.isEmpty =>
         getStringEncoder(alphaNumeric.enc.getOrElse(EBCDIC), ebcdicCodePage, asciiCharset, alphaNumeric.length)
+      case integralComp3: Integral if integralComp3.compact.exists(_.isInstanceOf[COMP3]) =>
+        Option(getBdcEncoder(integralComp3.precision, 0, 0, integralComp3.signPosition.isDefined, mandatorySignNibble = true))
+      case integralComp3: Integral if integralComp3.compact.exists(_.isInstanceOf[COMP3U]) =>
+        Option(getBdcEncoder(integralComp3.precision, 0, 0, integralComp3.signPosition.isDefined, mandatorySignNibble = false))
+      case decimalComp3: Decimal if decimalComp3.compact.exists(_.isInstanceOf[COMP3]) =>
+        Option(getBdcEncoder(decimalComp3.precision, decimalComp3.scale, decimalComp3.scaleFactor, decimalComp3.signPosition.isDefined, mandatorySignNibble = true))
+      case decimalComp3: Decimal if decimalComp3.compact.exists(_.isInstanceOf[COMP3U]) =>
+        Option(getBdcEncoder(decimalComp3.precision, decimalComp3.scale, decimalComp3.scaleFactor, decimalComp3.signPosition.isDefined, mandatorySignNibble = false))
       case _ =>
         None
     }
@@ -80,4 +88,26 @@
     buf
   }
 
+  def getBdcEncoder(precision: Int,
+                    scale: Int,
+                    scaleFactor: Int,
+                    signed: Boolean,
+                    mandatorySignNibble: Boolean): Encoder = {
+    if (signed && !mandatorySignNibble)
+      throw new IllegalArgumentException("If signed is true, mandatorySignNibble must also be true.")
+
+    (a: Any) => {
+      val number = a match {
+        case null => null
+        case d: java.math.BigDecimal => d
+        case n: java.math.BigInteger => new java.math.BigDecimal(n)
+        case n: Byte => new java.math.BigDecimal(n)
+        case n: Int => new java.math.BigDecimal(n)
+        case n: Long => new java.math.BigDecimal(n)
+        case x => new java.math.BigDecimal(x.toString)
+      }
+      BCDNumberEncoders.encodeBCDNumber(number, precision, scale, scaleFactor, signed, mandatorySignNibble)
+    }
+  }
 }
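
BCDNumberEncoders.encodeBCDNumber is referenced here but its implementation is not part of this diff. The sketch below shows the packed-decimal (BCD) layout it is expected to produce, matching the bytes asserted in the writer test further down; it is an illustration under those assumptions, not the library code (encodeBcdSketch is a hypothetical name, and scaleFactor handling is omitted):

    import java.math.{BigDecimal => JBigDecimal, RoundingMode}

    // One decimal digit per nibble. When mandatorySignNibble is true (COMP-3) the last
    // nibble is the sign: 0xC positive, 0xD negative, 0xF unsigned. COMP-3U packs digits
    // only. An odd nibble count is left-padded with a zero nibble.
    def encodeBcdSketch(number: JBigDecimal,
                        precision: Int,
                        scale: Int,
                        signed: Boolean,
                        mandatorySignNibble: Boolean): Array[Byte] = {
      val unscaled = number.setScale(scale, RoundingMode.HALF_UP).unscaledValue().abs().toString
      val digits   = ("0" * (precision - unscaled.length)) + unscaled  // pad to the declared precision

      val nibbles: Seq[Int] =
        if (mandatorySignNibble) {
          val signNibble = if (!signed) 0xF else if (number.signum() < 0) 0xD else 0xC
          digits.map(_ - '0') :+ signNibble
        } else {
          digits.map(_ - '0')
        }

      val padded = if (nibbles.length % 2 == 0) nibbles else 0 +: nibbles
      padded.grouped(2).map { case Seq(hi, lo) => ((hi << 4) | lo).toByte }.toArray
    }

    // 100.5 as PIC 9(4)V9(2) COMP-3 (precision 6, scale 2, unsigned, sign nibble kept)
    // packs to 0x00 0x10 0x05 0x0F, which is what the writer test below expects for field B.
    encodeBcdSketch(new JBigDecimal("100.5"), precision = 6, scale = 2, signed = false, mandatorySignNibble = true)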

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/extract/BinaryExtractorSpec.scala

Lines changed: 3 additions & 3 deletions
@@ -216,7 +216,7 @@ class BinaryExtractorSpec extends AnyFunSuite {
     val fieldName2: String = "COMPANY.COMPANY-ID-NUM"
     val fields2 = copybook.getFieldByName(fieldName2)
     assert(fields2.isInstanceOf[Primitive])
-    assert(fields2.asInstanceOf[Primitive].encode.isEmpty)
+    assert(fields2.asInstanceOf[Primitive].encode.nonEmpty)
   }
 
   test("Test padding when setting field value by name") {
@@ -230,7 +230,7 @@
     val fieldName2: String = "COMPANY.COMPANY-ID-NUM"
     val fields2 = copybook2.getFieldByName(fieldName2)
     assert(fields2.isInstanceOf[Primitive])
-    assert(fields2.asInstanceOf[Primitive].encode.isEmpty)
+    assert(fields2.asInstanceOf[Primitive].encode.nonEmpty)
   }
 
   test("Test truncating when setting field value by name") {
@@ -244,6 +244,6 @@
     val fieldName2: String = "COMPANY.COMPANY-ID-NUM"
     val fields2 = copybook2.getFieldByName(fieldName2)
     assert(fields2.isInstanceOf[Primitive])
-    assert(fields2.asInstanceOf[Primitive].encode.isEmpty)
+    assert(fields2.asInstanceOf[Primitive].encode.nonEmpty)
   }
 }

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/BasicRecordCombiner.scala

Lines changed: 27 additions & 4 deletions
@@ -18,17 +18,24 @@ package za.co.absa.cobrix.spark.cobol.writer
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
+import za.co.absa.cobrix.cobol.parser.Copybook
 import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive, Statement}
 import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
 import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
+import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, COMP3, COMP3U, CobolType, Decimal, Integral}
+
+import java.io.{ByteArrayOutputStream, ObjectOutputStream}
 
 class BasicRecordCombiner extends RecordCombiner {
+
+  import BasicRecordCombiner._
+
   override def combine(df: DataFrame, cobolSchema: CobolSchema, readerParameters: ReaderParameters): RDD[Array[Byte]] = {
     val ast = getAst(cobolSchema)
     val copybookFields = ast.children.filter {
       case p: Primitive => !p.isFiller
-      case g: Group => !g.isFiller
-      case _ => true
+      case g: Group     => !g.isFiller
+      case _            => true
     }
 
     validateSchema(df, copybookFields.toSeq)
@@ -38,7 +45,9 @@
 
     cobolFields.foreach(cobolField =>
       if (cobolField.encode.isEmpty) {
-        throw new IllegalArgumentException(s"Field '${cobolField.name}' does not have an encoding defined in the copybook. 'PIC ${cobolField.dataType.originalPic}' is not yet supported.")
+        val fieldDefinition = getFieldDefinition(cobolField)
+        throw new IllegalArgumentException(s"Field '${cobolField.name}' does not have an encoding defined in the copybook. " +
+          s"'PIC $fieldDefinition' is not yet supported.")
       }
     )
 
@@ -62,7 +71,7 @@
         if (!row.isNullAt(sparkIdx)) {
           val fieldStr = row.get(sparkIdx)
           val cobolField = cobolFields(cobolIdx)
-          cobolSchema.copybook.setPrimitiveField(cobolField, ar, fieldStr, 0)
+          Copybook.setPrimitiveField(cobolField, ar, fieldStr, 0)
         }
       }
 
@@ -107,3 +116,17 @@
     }
   }
 }
+
+object BasicRecordCombiner {
+  def getFieldDefinition(field: Primitive): String = {
+    val pic = field.dataType.originalPic.getOrElse(field.dataType.pic)
+
+    val usage = field.dataType match {
+      case dt: Integral => dt.compact.map(_.toString).getOrElse("USAGE IS DISPLAY")
+      case dt: Decimal => dt.compact.map(_.toString).getOrElse("USAGE IS DISPLAY")
+      case _ => ""
+    }
+
+    s"$pic $usage".trim
+  }
+}
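
For illustration, getFieldDefinition lets the error message show the full field definition (PIC plus usage) instead of interpolating the raw Option returned by originalPic, which previously rendered as 'PIC Some(...)'. A hypothetical reproduction (the copybook snippet and field name are illustrative, and the exact usage text depends on the parsed usage type's toString):

    import za.co.absa.cobrix.cobol.parser.CopybookParser
    import za.co.absa.cobrix.cobol.parser.ast.Primitive
    import za.co.absa.cobrix.spark.cobol.writer.BasicRecordCombiner

    // A binary (COMP) field has no writer encoder yet, so the combiner would reject it
    // and report its definition using the new helper.
    val copybook = CopybookParser.parseTree(
      """        01  RECORD.
        |           05  AMOUNT   PIC S9(4)  COMP.
        |""".stripMargin)

    val amount = copybook.getFieldByName("AMOUNT").asInstanceOf[Primitive]

    // Expected to render the PIC followed by the usage, e.g. "S9(4) COMP",
    // so the error would read: 'PIC S9(4) COMP' is not yet supported.
    println(BasicRecordCombiner.getFieldDefinition(amount))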

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala

Lines changed: 58 additions & 0 deletions
@@ -126,6 +126,64 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
       }
     }
 
+    "write data frames with COMP-3 fields" in {
+      withTempDirectory("cobol_writer1") { tempDir =>
+        val df = List(
+          (1, 100.5, new java.math.BigDecimal(10.23), 1, 100.5, new java.math.BigDecimal(10.12)),
+          (2, 800.4, new java.math.BigDecimal(30), 2, 800.4, new java.math.BigDecimal(30)),
+          (3, 22.33, new java.math.BigDecimal(-20), 3, 22.33, new java.math.BigDecimal(-20))
+        ).toDF("A", "B", "C", "D", "E", "F")
+
+        val path = new Path(tempDir, "writer1")
+
+        val copybookContentsWithFilers =
+          """      01 RECORD.
+                05 A PIC S9(1) COMP-3.
+                05 B PIC 9(4)V9(2) COMP-3.
+                05 C PIC S9(2)V9(2) COMP-3.
+                05 D PIC 9(1) COMP-3U.
+                05 E PIC 9(4)V9(2) COMP-3U.
+                05 F PIC 9(2)V9(2) COMP-3U.
+          """
+
+        df.coalesce(1)
+          .orderBy("A")
+          .write
+          .format("cobol")
+          .mode(SaveMode.Overwrite)
+          .option("copybook_contents", copybookContentsWithFilers)
+          .save(path.toString)
+
+        val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration)
+
+        assert(fs.exists(path), "Output directory should exist")
+        val files = fs.listStatus(path)
+          .filter(_.getPath.getName.startsWith("part-"))
+        assert(files.nonEmpty, "Output directory should contain part files")
+
+        val partFile = files.head.getPath
+        val data = fs.open(partFile)
+        val bytes = new Array[Byte](files.head.getLen.toInt)
+        data.readFully(bytes)
+        data.close()
+
+        // Expected EBCDIC data for sample test data
+        val expected = Array(
+          0x1C, 0x00, 0x10, 0x05, 0x0F, 0x01, 0x02, 0x3C, 0x01, 0x01, 0x00, 0x50, 0x10, 0x12,
+          0x2C, 0x00, 0x80, 0x04, 0x0F, 0x03, 0x00, 0x0C, 0x02, 0x08, 0x00, 0x40, 0x30, 0x00,
+          0x3C, 0x00, 0x02, 0x23, 0x3F, 0x02, 0x00, 0x0D, 0x03, 0x00, 0x22, 0x33, 0x00, 0x00
+        ).map(_.toByte)
+
+        if (!bytes.sameElements(expected)) {
+          println(s"Expected bytes: ${expected.map("%02X" format _).mkString(" ")}")
+          println(s"Actual bytes:   ${bytes.map("%02X" format _).mkString(" ")}")
+
+          assert(bytes.sameElements(expected), "Written data should match expected EBCDIC encoding")
+        }
+      }
+    }
+
+
     "write should fail with save mode append and the path exists" in {
       withTempDirectory("cobol_writer3") { tempDir =>
         val df = List(("A", "First"), ("B", "Scnd"), ("C", "Last")).toDF("A", "B")
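
For readers unfamiliar with packed decimal, the first expected record (14 bytes) breaks down as follows; signed COMP-3 values end in a 0xC (positive) or 0xD (negative) sign nibble, unsigned COMP-3 values end in 0xF, and COMP-3U omits the sign nibble entirely:

    Field  PIC                    Value    Packed bytes
    A      S9(1)      COMP-3      1        1C
    B      9(4)V9(2)  COMP-3      100.50   00 10 05 0F
    C      S9(2)V9(2) COMP-3      10.23    01 02 3C
    D      9(1)       COMP-3U     1        01
    E      9(4)V9(2)  COMP-3U     100.50   01 00 50
    F      9(2)V9(2)  COMP-3U     10.12    10 12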
