Commit 1104279

#415 Implement a basic fixed record length record combiner.
1 parent ed463d4 commit 1104279
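This commit wires up a minimal write path for fixed-length records. A usage sketch of what it enables, assuming an illustrative copybook, column names and output path (the format name, SaveMode.Overwrite and the "copybook_contents" option mirror the test added in this commit):

// Illustrative session, copybook and data; only the options noted above come from this commit's test.
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().master("local[*]").appName("cobol-writer-sketch").getOrCreate()
import spark.implicits._

val copybookContents =
  """        01  RECORD.
    |           05  A  PIC X(1).
    |           05  B  PIC X(5).
    |""".stripMargin

val df = Seq(("A", "First"), ("B", "Scnd"), ("C", "Last")).toDF("A", "B")

df.write
  .format("cobol")
  .mode(SaveMode.Overwrite)
  .option("copybook_contents", copybookContents)
  .save("/tmp/ebcdic_out")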

10 files changed: +158 -22 lines

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ import za.co.absa.cobrix.cobol.parser.CopybookParser.CopybookAST
 import za.co.absa.cobrix.cobol.parser.ast.datatype._
 import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive}
 import za.co.absa.cobrix.cobol.parser.common.Constants
-import za.co.absa.cobrix.cobol.parser.decoders.{DecoderSelector, EncoderSelector}
+import za.co.absa.cobrix.cobol.parser.decoders.DecoderSelector
 import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
 import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
 import za.co.absa.cobrix.cobol.parser.encoding._

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/ast/Primitive.scala

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,8 @@
 package za.co.absa.cobrix.cobol.parser.ast
 
 import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType, Decimal, Integral}
-import za.co.absa.cobrix.cobol.parser.decoders.{BinaryUtils, DecoderSelector, EncoderSelector}
+import za.co.absa.cobrix.cobol.parser.decoders.{BinaryUtils, DecoderSelector}
+import za.co.absa.cobrix.cobol.parser.encoding.EncoderSelector
 
 /** An abstraction of the statements describing fields of primitive data types in the COBOL copybook
   *

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala

Lines changed: 2 additions & 2 deletions
@@ -20,9 +20,9 @@ import za.co.absa.cobrix.cobol.parser.CopybookParser.CopybookAST
 import za.co.absa.cobrix.cobol.parser.ast.datatype.AlphaNumeric
 import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive, Statement}
 import za.co.absa.cobrix.cobol.parser.common.Constants
-import za.co.absa.cobrix.cobol.parser.decoders.{DecoderSelector, EncoderSelector}
+import za.co.absa.cobrix.cobol.parser.decoders.DecoderSelector
 import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
-import za.co.absa.cobrix.cobol.parser.encoding.Encoding
+import za.co.absa.cobrix.cobol.parser.encoding.{EncoderSelector, Encoding}
 import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
 import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy.StringTrimmingPolicy

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/EncoderSelector.scala renamed to cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/EncoderSelector.scala

Lines changed: 1 addition & 2 deletions
@@ -14,11 +14,10 @@
  * limitations under the License.
  */
 
-package za.co.absa.cobrix.cobol.parser.decoders
+package za.co.absa.cobrix.cobol.parser.encoding
 
 import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType}
 import za.co.absa.cobrix.cobol.parser.encoding.codepage.{CodePage, CodePageCommon}
-import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC, Encoding}
 
 import java.nio.charset.{Charset, StandardCharsets}

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/extract/BinaryExtractorSpec.scala

Lines changed: 2 additions & 2 deletions
@@ -20,8 +20,8 @@ import org.scalatest.funsuite.AnyFunSuite
 import za.co.absa.cobrix.cobol.parser.CopybookParser
 import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType}
 import za.co.absa.cobrix.cobol.parser.ast.{BinaryProperties, Group, Primitive}
-import za.co.absa.cobrix.cobol.parser.decoders.{DecoderSelector, EncoderSelector}
-import za.co.absa.cobrix.cobol.parser.encoding.EBCDIC
+import za.co.absa.cobrix.cobol.parser.decoders.DecoderSelector
+import za.co.absa.cobrix.cobol.parser.encoding.{EBCDIC, EncoderSelector}
 
 class BinaryExtractorSpec extends AnyFunSuite {
 
spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/DefaultSource.scala

Lines changed: 7 additions & 3 deletions
@@ -22,14 +22,14 @@ import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
 import za.co.absa.cobrix.cobol.internal.Logging
-import za.co.absa.cobrix.cobol.reader.parameters.{CobolParameters, CobolParametersParser, Parameters}
 import za.co.absa.cobrix.cobol.reader.parameters.CobolParametersParser._
+import za.co.absa.cobrix.cobol.reader.parameters.{CobolParameters, CobolParametersParser, Parameters}
 import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
 import za.co.absa.cobrix.spark.cobol.reader._
 import za.co.absa.cobrix.spark.cobol.source.copybook.CopybookContentLoader
 import za.co.absa.cobrix.spark.cobol.source.parameters._
 import za.co.absa.cobrix.spark.cobol.utils.{BuildProperties, SparkUtils}
-import za.co.absa.cobrix.spark.cobol.writer.{BasicRecordCombiner, RawBinaryOutputFormat}
+import za.co.absa.cobrix.spark.cobol.writer.{RawBinaryOutputFormat, RecordCombinerSelector}
 
 /**
  * This class represents a Cobol data source.
@@ -82,13 +82,17 @@ class DefaultSource
          fs.delete(outputPath, true)
        }
      case SaveMode.Append =>
+       throw new IllegalArgumentException(
+         s"Save mode '$mode' is not supported by the 'spark-cobol' data source at the moment. " +
+           "Please use 'Overwrite' mode to write data to a file or folder."
+       )
      case _ =>
    }
 
    val copybookContent = CopybookContentLoader.load(cobolParameters, sqlContext.sparkContext.hadoopConfiguration)
    val cobolSchema = CobolSchema.fromReaderParameters(copybookContent, readerParameters)
 
-   val combiner = new BasicRecordCombiner
+   val combiner = RecordCombinerSelector.selectCombiner(cobolSchema, readerParameters)
 
    val rdd = combiner.combine(data, cobolSchema, readerParameters)
 
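With the change above, an Append write fails fast instead of silently falling through. A sketch of the resulting behaviour, reusing the illustrative DataFrame, copybook and path from the sketch at the top:

// SaveMode.Append is rejected with an IllegalArgumentException; use SaveMode.Overwrite instead.
df.write
  .format("cobol")
  .mode(SaveMode.Append) // throws: "Save mode 'Append' is not supported by the 'spark-cobol' data source at the moment. ..."
  .option("copybook_contents", copybookContents)
  .save("/tmp/ebcdic_out")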
spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/BasicRecordCombiner.scala

Lines changed: 69 additions & 6 deletions
@@ -18,21 +18,84 @@ package za.co.absa.cobrix.spark.cobol.writer
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
+import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive}
 import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
 import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
 
-import java.util
-import scala.util.Random
-
 class BasicRecordCombiner extends RecordCombiner {
   override def combine(df: DataFrame, cobolSchema: CobolSchema, readerParameters: ReaderParameters): RDD[Array[Byte]] = {
+    val ast = getAst(cobolSchema)
+    validateSchema(df, ast)
+
+    val cobolFields = ast.children.map(_.asInstanceOf[Primitive])
+    val sparkFields = df.schema.fields.map(_.name.toLowerCase)
+
+    cobolFields.foreach(cobolField =>
+      if (cobolField.encode.isEmpty) {
+        throw new IllegalArgumentException(s"Field '${cobolField.name}' does not have an encoding defined in the copybook. 'PIC ${cobolField.dataType.originalPic}' is not yet supported.")
+      }
+    )
+
+    val sparkFieldPositions = cobolFields.map { cobolField =>
+      val fieldName = cobolField.name.toLowerCase
+      val position = sparkFields.indexOf(fieldName)
+
+      if (position < 0) {
+        throw new IllegalArgumentException(s"Field '${cobolField.name}' from the copybook is not found in the DataFrame schema.")
+      }
+
+      position
+    }
+
+    val size = cobolSchema.getRecordSize
+
     df.rdd.map { row =>
-      val r = Random.nextInt(100)
-      val ar = new Array[Byte](10)
+      val ar = new Array[Byte](size)
 
-      util.Arrays.fill(ar, r.toByte)
+      sparkFieldPositions.foreach { index =>
+        val fieldStr = row.get(index)
+        val cobolField = cobolFields(index)
+        cobolSchema.copybook.setPrimitiveField(cobolField, ar, fieldStr, 0)
+      }
 
       ar
     }
   }
+
+  private def validateSchema(df: DataFrame, ast: Group): Unit = {
+    val dfFields = df.schema.fields.map(_.name.toLowerCase).toSet
+
+    val notFoundFields = ast.children.flatMap { field =>
+      if (dfFields.contains(field.name.toLowerCase)) {
+        None
+      } else {
+        Some(field.name)
+      }
+    }
+
+    if (notFoundFields.nonEmpty) {
+      throw new IllegalArgumentException(s"The following fields from the copybook are not found in the DataFrame: ${notFoundFields.mkString(", ")}")
+    }
+
+    val unsupportedDataTypeFields = ast.children.filter { field =>
+      field.isInstanceOf[Group] ||
+        (field.isInstanceOf[Primitive] && field.asInstanceOf[Primitive].occurs.isDefined) ||
+        field.redefines.nonEmpty
+    }
+
+    if (unsupportedDataTypeFields.nonEmpty) {
+      throw new IllegalArgumentException(s"The following fields from the copybook are not supported by the 'spark-cobol' at the moment: " +
+        s"${unsupportedDataTypeFields.map(_.name).mkString(", ")}. Only primitive fields without redefines and occurs are supported.")
+    }
+  }
+
+  private def getAst(cobolSchema: CobolSchema): Group = {
+    val rootAst = cobolSchema.copybook.ast
+
+    if (rootAst.children.length == 1 && rootAst.children.head.isInstanceOf[Group]) {
+      rootAst.children.head.asInstanceOf[Group]
+    } else {
+      rootAst
+    }
+  }
 }
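BasicRecordCombiner matches DataFrame columns to copybook fields by name, case-insensitively, requires every field to have an encoder, and accepts only primitive fields without OCCURS or REDEFINES. A sketch of input that fails this validation (the copybook and column names are illustrative):

// The copybook declares fields A and B, but the DataFrame only has column "a", so validateSchema
// throws: "The following fields from the copybook are not found in the DataFrame: B"
val copybookWithTwoFields =
  """        01  RECORD.
    |           05  A  PIC X(1).
    |           05  B  PIC X(5).
    |""".stripMargin

val badDf = Seq("x", "y", "z").toDF("a")

badDf.write
  .format("cobol")
  .mode(SaveMode.Overwrite)
  .option("copybook_contents", copybookWithTwoFields)
  .save("/tmp/ebcdic_out") // expected to fail during schema validation, before any data is written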

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/RawBinaryOutputFormat.scala

Lines changed: 13 additions & 4 deletions
@@ -16,6 +16,7 @@
 
 package za.co.absa.cobrix.spark.cobol.writer
 
+import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.io.{BytesWritable, NullWritable}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
@@ -36,15 +37,23 @@ import java.io.DataOutputStream
  * - The key type for the output is `NullWritable` because the key is not used.
  * - The value type for the output is `BytesWritable`, which represents the binary data to be written.
  */
+
 class RawBinaryOutputFormat extends FileOutputFormat[NullWritable, BytesWritable] {
   override def getRecordWriter(context: TaskAttemptContext): RecordWriter[NullWritable, BytesWritable] = {
-    val file = getDefaultWorkFile(context, "")
-    val out: DataOutputStream = file.getFileSystem(context.getConfiguration).create(file)
+    val path: Path = getDefaultWorkFile(context, ".dat")
+    val fs = path.getFileSystem(context.getConfiguration)
+    val out: DataOutputStream = fs.create(path, false)
+
     new RecordWriter[NullWritable, BytesWritable] {
       override def write(key: NullWritable, value: BytesWritable): Unit = {
-        out.write(value.getBytes, 0, value.getLength)
+        if (value != null) {
+          out.write(value.getBytes, 0, value.getLength) // No separator
+        }
+      }
+      override def close(context: TaskAttemptContext): Unit = {
+        out.close()
       }
-      override def close(context: TaskAttemptContext): Unit = out.close()
     }
   }
 }
+
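Because RawBinaryOutputFormat writes record bytes back to back with no separators, the resulting .dat part files are plain fixed-length binary and can be read back with the spark-cobol reader. A round-trip sketch, reusing the illustrative session, copybook and path from the sketches above (default reader settings, which assume EBCDIC fixed-length records):

// Read the fixed-length EBCDIC output back; the record length is derived from the copybook.
val readBack = spark.read
  .format("cobol")
  .option("copybook_contents", copybookContents)
  .load("/tmp/ebcdic_out")

readBack.show(false)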
spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/RecordCombinerSelector.scala

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.writer
+
+import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
+
+object RecordCombinerSelector {
+  /**
+    * Selects and returns an appropriate implementation of the `RecordCombiner` based on the provided COBOL schema
+    * and reader parameters.
+    *
+    * Currently, only a basic fixed record length combiner is implemented.
+    * This method is to be extended as writing capabilities evolve.
+    *
+    * @param cobolSchema      The COBOL schema of the output record.
+    * @param readerParameters Configuration parameters that specify how records should be formed.
+    * @return A `RecordCombiner` implementation suitable for combining records based on the given schema and parameters.
+    */
+  def selectCombiner(cobolSchema: CobolSchema, readerParameters: ReaderParameters): RecordCombiner = {
+    new BasicRecordCombiner
+  }
+
+}
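RecordCombinerSelector currently has a single branch. A hypothetical sketch of how it might grow as more combiners are added; the match on readerParameters.recordFormat is an assumption about the existing reader API, not part of this commit:

// Hypothetical future shape (not in this commit): pick a combiner per requested record format.
import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat

def selectCombiner(cobolSchema: CobolSchema, readerParameters: ReaderParameters): RecordCombiner = {
  readerParameters.recordFormat match {
    case RecordFormat.FixedLength => new BasicRecordCombiner // the combiner implemented by this commit
    case other => throw new IllegalArgumentException(s"Writing records in format '$other' is not supported yet.")
  }
}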

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala

Lines changed: 23 additions & 1 deletion
@@ -38,7 +38,9 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B
 
       val path = new Path(tempDir, "writer1")
 
-      df.write
+      df.repartition(1)
+        .orderBy("A")
+        .write
         .format("cobol")
         .mode(SaveMode.Overwrite)
         .option("copybook_contents", copybookContents)
@@ -50,6 +52,26 @@
       val files = fs.listStatus(path)
         .filter(_.getPath.getName.startsWith("part-"))
       assert(files.nonEmpty, "Output directory should contain part files")
+
+      val partFile = files.head.getPath
+      val data = fs.open(partFile)
+      val bytes = new Array[Byte](files.head.getLen.toInt)
+      data.readFully(bytes)
+      data.close()
+
+      // Expected EBCDIC data for sample test data
+      val expected = Array[Byte](
+        0xC1.toByte, 0xC6.toByte, 0x89.toByte, 0x99.toByte, 0xA2.toByte, 0xA3.toByte, // A,First
+        0xC2.toByte, 0xE2.toByte, 0x83.toByte, 0x95.toByte, 0x84.toByte, 0x00.toByte, // B,Scnd_
+        0xC3.toByte, 0xD3.toByte, 0x81.toByte, 0xA2.toByte, 0xA3.toByte, 0x00.toByte  // C,Last_
+      )
+
+      if (!bytes.sameElements(expected)) {
+        println(s"Expected bytes: ${expected.map("%02X" format _).mkString(" ")}")
+        println(s"Actual bytes: ${bytes.map("%02X" format _).mkString(" ")}")
+
+        assert(bytes.sameElements(expected), "Written data should match expected EBCDIC encoding")
+      }
     }
   }
 }
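As a sanity check, the expected byte array decodes back to the test data under a common EBCDIC code page (Cp037 is used below as an assumption; Cobrix's "common" EBCDIC table is close but not guaranteed identical):

// Each record is 6 bytes: a 1-byte key followed by a 5-byte value padded with 0x00.
import java.nio.charset.Charset

val expected = Array(
  0xC1, 0xC6, 0x89, 0x99, 0xA2, 0xA3, // "AFirst"
  0xC2, 0xE2, 0x83, 0x95, 0x84, 0x00, // "BScnd" + padding
  0xC3, 0xD3, 0x81, 0xA2, 0xA3, 0x00  // "CLast" + padding
).map(_.toByte)

println(new String(expected, Charset.forName("Cp037")).replace('\u0000', '_')) // AFirstBScnd_CLast_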
