Commit ed463d4

#415 Implement a simple binary record writer.

1 parent 74e9518, commit ed463d4

7 files changed: +199 additions, −50 deletions

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala

Lines changed: 6 additions & 1 deletion
@@ -246,9 +246,14 @@ object CobolParametersParser extends Logging {
       recordFormatDefined
     }
 
+    val copybookPaths = params.get(PARAM_MULTI_COPYBOOK_PATH) match {
+      case Some(paths) => paths.split(',').toSeq
+      case None => Seq.empty[String]
+    }
+
     val cobolParameters = CobolParameters(
       getParameter(PARAM_COPYBOOK_PATH, params),
-      params.getOrElse(PARAM_MULTI_COPYBOOK_PATH, "").split(','),
+      copybookPaths,
       getParameter(PARAM_COPYBOOK_CONTENTS, params),
       paths,
       recordFormat,
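
Note that this refactoring is not just cosmetic: splitting the old default empty string produces a spurious one-element sequence, while the new match yields a genuinely empty one when the option is absent. A quick check in the Scala REPL:

  "".split(',').toSeq   // Seq("") — old behavior: one empty copybook path
  Seq.empty[String]     // Seq()   — new behavior when the option is absent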

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/DefaultSource.scala

Lines changed: 22 additions & 4 deletions
@@ -17,16 +17,19 @@
 package za.co.absa.cobrix.spark.cobol.source
 
 import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.{BytesWritable, NullWritable}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
 import za.co.absa.cobrix.cobol.internal.Logging
 import za.co.absa.cobrix.cobol.reader.parameters.{CobolParameters, CobolParametersParser, Parameters}
 import za.co.absa.cobrix.cobol.reader.parameters.CobolParametersParser._
+import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
 import za.co.absa.cobrix.spark.cobol.reader._
 import za.co.absa.cobrix.spark.cobol.source.copybook.CopybookContentLoader
 import za.co.absa.cobrix.spark.cobol.source.parameters._
 import za.co.absa.cobrix.spark.cobol.utils.{BuildProperties, SparkUtils}
+import za.co.absa.cobrix.spark.cobol.writer.{BasicRecordCombiner, RawBinaryOutputFormat}
 
 /**
   * This class represents a Cobol data source.
@@ -65,6 +68,11 @@ class DefaultSource
     val path = parameters.getOrElse("path",
       throw new IllegalArgumentException("Path is required for this data source."))
 
+    val cobolParameters = CobolParametersParser.parse(new Parameters(parameters))
+    CobolParametersValidator.checkSanity(cobolParameters)
+
+    val readerParameters = CobolParametersParser.getReaderProperties(cobolParameters, None)
+
     mode match {
       case SaveMode.Overwrite =>
         val outputPath = new Path(path)
@@ -77,10 +85,20 @@ class DefaultSource
       case _ =>
     }
 
-    // Simply save each row as comma-separated values in a text file
-    data.rdd
-      .map(row => row.mkString(","))
-      .saveAsTextFile(path)
+    val copybookContent = CopybookContentLoader.load(cobolParameters, sqlContext.sparkContext.hadoopConfiguration)
+    val cobolSchema = CobolSchema.fromReaderParameters(copybookContent, readerParameters)
+
+    val combiner = new BasicRecordCombiner
+
+    val rdd = combiner.combine(data, cobolSchema, readerParameters)
+
+    rdd.map(bytes => (NullWritable.get(), new BytesWritable(bytes)))
+      .saveAsNewAPIHadoopFile(
+        path,
+        classOf[NullWritable],
+        classOf[BytesWritable],
+        classOf[RawBinaryOutputFormat]
+      )
 
     new BaseRelation {
       override def sqlContext: SQLContext = sqlContext
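
For context: `saveAsNewAPIHadoopFile` is only available on pair RDDs, which is why each record is wrapped in a `(NullWritable, BytesWritable)` tuple even though the output format ignores the key. A minimal standalone sketch of the same save path (assuming an existing `SparkContext` named `sc`; the output path is a placeholder):

  import org.apache.hadoop.io.{BytesWritable, NullWritable}
  import org.apache.spark.rdd.RDD
  import za.co.absa.cobrix.spark.cobol.writer.RawBinaryOutputFormat

  // An RDD of already-encoded records (contents are placeholders)
  val records: RDD[Array[Byte]] = sc.parallelize(Seq(Array[Byte](0x01, 0x02), Array[Byte](0x03)))

  records
    .map(bytes => (NullWritable.get(), new BytesWritable(bytes))) // pair RDD required by the Hadoop API
    .saveAsNewAPIHadoopFile(
      "/tmp/raw-out",
      classOf[NullWritable],
      classOf[BytesWritable],
      classOf[RawBinaryOutputFormat]
    )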
spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/BasicRecordCombiner.scala

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.writer
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
+import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
+
+import java.util
+import scala.util.Random
+
+class BasicRecordCombiner extends RecordCombiner {
+  override def combine(df: DataFrame, cobolSchema: CobolSchema, readerParameters: ReaderParameters): RDD[Array[Byte]] = {
+    df.rdd.map { row =>
+      val r = Random.nextInt(100)
+      val ar = new Array[Byte](10)
+
+      util.Arrays.fill(ar, r.toByte)
+
+      ar
+    }
+  }
+}
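
As the commit title says, this is a simple first cut: the combiner ignores both the DataFrame contents and the copybook schema, emitting ten-byte records filled with a random value. A real implementation would encode each field according to the copybook. A hypothetical sketch of fixed-width text encoding (not part of this commit; assumes the JVM ships the IBM037 EBCDIC charset, which standard JDKs do):

  import java.nio.charset.Charset

  // Hypothetical helper, for illustration only
  val ebcdic: Charset = Charset.forName("IBM037") // EBCDIC US/Canada code page

  def encodeTextField(value: String, width: Int): Array[Byte] = {
    // Pad with spaces to the copybook field width (PIC X(n)), truncating overflow
    value.padTo(width, ' ').take(width).getBytes(ebcdic)
  }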
spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/RawBinaryOutputFormat.scala

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.writer
+
+import org.apache.hadoop.mapreduce._
+import org.apache.hadoop.io.{BytesWritable, NullWritable}
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+
+import java.io.DataOutputStream
+
+/**
+  * A custom implementation of `FileOutputFormat` that outputs raw binary data for fixed record length
+  * outputs, or for variable record length outputs when record size headers are already embedded into
+  * each record's byte array.
+  *
+  * The `RawBinaryOutputFormat` class is designed to write binary data into output files
+  * without adding any additional structure or metadata. Each record is written directly
+  * as a stream of bytes to the output.
+  *
+  * This output format only handles records that are represented as `BytesWritable` and ignores the key.
+  *
+  *  - The key type for the output is `NullWritable` because the key is not used.
+  *  - The value type for the output is `BytesWritable`, which represents the binary data to be written.
+  */
+class RawBinaryOutputFormat extends FileOutputFormat[NullWritable, BytesWritable] {
+  override def getRecordWriter(context: TaskAttemptContext): RecordWriter[NullWritable, BytesWritable] = {
+    val file = getDefaultWorkFile(context, "")
+    val out: DataOutputStream = file.getFileSystem(context.getConfiguration).create(file)
+    new RecordWriter[NullWritable, BytesWritable] {
+      override def write(key: NullWritable, value: BytesWritable): Unit = {
+        out.write(value.getBytes, 0, value.getLength)
+      }
+      override def close(context: TaskAttemptContext): Unit = out.close()
+    }
+  }
+}
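
Since the format writes nothing but the record bytes themselves, one quick way to verify a write is to read the part files back as raw binary. A sketch using the standard Spark API (the output path is a placeholder):

  // Each part file comes back as a (path, stream) pair; toArray() yields its raw bytes
  val partBytes: Array[Array[Byte]] = spark.sparkContext
    .binaryFiles("/tmp/raw-out")
    .map { case (_, stream) => stream.toArray() }
    .collect()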
spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/writer/RecordCombiner.scala

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.writer
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
+import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
+import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
+
+trait RecordCombiner {
+  def combine(df: DataFrame, cobolSchema: CobolSchema, readerParameters: ReaderParameters): RDD[Array[Byte]]
+}
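
The trait is the extension point for different record layouts. Purely as an illustration of its shape (not part of the commit), a combiner that serializes each row as UTF-8 CSV bytes could look like:

  import org.apache.spark.rdd.RDD
  import org.apache.spark.sql.DataFrame
  import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
  import za.co.absa.cobrix.cobol.reader.schema.CobolSchema

  // Illustrative only: ignores the copybook schema and reader parameters
  class CsvBytesCombiner extends RecordCombiner {
    override def combine(df: DataFrame, cobolSchema: CobolSchema, readerParameters: ReaderParameters): RDD[Array[Byte]] =
      df.rdd.map(row => (row.mkString(",") + "\n").getBytes("UTF-8"))
  }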

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/WriterSourceSpec.scala

Lines changed: 0 additions & 45 deletions
This file was deleted.
spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.writer
+
+import org.apache.hadoop.fs.Path
+import org.apache.spark.sql.SaveMode
+import org.scalatest.wordspec.AnyWordSpec
+import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
+import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture
+
+class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with BinaryFileFixture {
+  import spark.implicits._
+
+  private val copybookContents =
+    """      01 RECORD.
+             05 A PIC X(1).
+             05 B PIC X(5).
+    """
+
+  "cobol writer" should {
+    "write simple fixed-record-length EBCDIC data files" in {
+      withTempDirectory("cobol_writer1") { tempDir =>
+        val df = List(("A", "First"), ("B", "Scnd"), ("C", "Last")).toDF("A", "B")
+
+        val path = new Path(tempDir, "writer1")
+
+        df.write
+          .format("cobol")
+          .mode(SaveMode.Overwrite)
+          .option("copybook_contents", copybookContents)
+          .save(path.toString)
+
+        val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration)
+
+        assert(fs.exists(path), "Output directory should exist")
+        val files = fs.listStatus(path)
+          .filter(_.getPath.getName.startsWith("part-"))
+        assert(files.nonEmpty, "Output directory should contain part files")
+      }
+    }
+  }
+}
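
The test only checks that part files appear. A natural follow-up check (a sketch, not in this commit) would be to round-trip the data through the existing Cobrix reader inside the same test body:

  val readBack = spark.read
    .format("cobol")
    .option("copybook_contents", copybookContents)
    .load(path.toString)

  // With the placeholder BasicRecordCombiner the contents won't match the input yet,
  // but once real field encoding lands this should reproduce the written rows.
  readBack.show()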
