From b1fe76056f83cef4de45404973754c6a47b7e7a2 Mon Sep 17 00:00:00 2001
From: NickEdwards7502
Date: Thu, 19 Sep 2024 17:29:00 +1000
Subject: [PATCH] DEV: Update FeatureSource dataframe conversion (#237)

REFACTOR: Remove conversion of whole RDD to DataFrame
FEAT: Add function for slicing rows and columns and converting to DF
---
Review notes (this section is not part of the commit message):
* head(): added Scaladoc, replaced the method-local `lazy val`s with plain
  `val`s (both are always evaluated before the method returns), and named the
  `nullable` argument of StructField.
* Hunk and diffstat counts are updated for the added lines; the post-image
  blob id on the `index` line predates these edits.
* Trailing context stops at the closing brace of head(), because the blank-line
  layout around the class's closing brace could not be recovered from the
  whitespace-collapsed copy of this patch.

 .../variantspark/input/VCFFeatureSource.scala | 33 ++++++++++++-------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/main/scala/au/csiro/variantspark/input/VCFFeatureSource.scala b/src/main/scala/au/csiro/variantspark/input/VCFFeatureSource.scala
index d8631f32..5bf7d9e2 100644
--- a/src/main/scala/au/csiro/variantspark/input/VCFFeatureSource.scala
+++ b/src/main/scala/au/csiro/variantspark/input/VCFFeatureSource.scala
@@ -60,18 +60,29 @@ class VCFFeatureSource(vcfSource: VCFSource, converter: VariantToFeatureConverte
     vcfSource.genotypes().map(converterRef.convert)
   }
 
-  lazy val sampleNamesStructArr: Array[StructField] =
-    sampleNames.map(StructField(_, ByteType, true)).toArray
-
-  lazy val featureDFSchema: StructType =
-    StructType(Seq(StructField("variant_id", StringType, true)) ++ sampleNamesStructArr)
-
-  def toDF(sqlContext: SQLContext): DataFrame = {
+  /**
+   * Builds a small preview DataFrame from the leading rows and columns of the features.
+   *
+   * Only the first `rowLim` features are taken from the feature RDD and each one is
+   * cut down to its first `colLim` values, so the whole RDD is never converted.
+   *
+   * @param sqlContext SQL context used to create the DataFrame
+   * @param rowLim maximum number of features (rows) to include
+   * @param colLim maximum number of sample columns to include
+   * @return a DataFrame with a `variant_id` string column followed by one byte column
+   *         per retained sample name
+   */
+  def head(sqlContext: SQLContext, rowLim: Int = 10, colLim: Int = 10): DataFrame = {
+    val sampleNamesStructArr: Array[StructField] =
+      sampleNames.take(colLim).map(StructField(_, ByteType, nullable = true)).toArray
+    val featureDFSchema: StructType =
+      StructType(Seq(StructField("variant_id", StringType, nullable = true)) ++ sampleNamesStructArr)
     val sc = sqlContext.sparkContext
 
-    val featureRDD: RDD[Row] =
-      features.mapPartitions { it =>
-        it.map { f => Row.fromSeq(f.label +: f.valueAsByteArray.toSeq) }
+    val slicedFeatureArray: Array[Row] =
+      features.take(rowLim).map { f =>
+        Row.fromSeq(f.label +: f.valueAsByteArray.take(colLim).toSeq)
       }
-    sqlContext.createDataFrame(featureRDD, featureDFSchema)
+    val slicedFeatureRDD: RDD[Row] = sc.parallelize(slicedFeatureArray)
+    sqlContext.createDataFrame(slicedFeatureRDD, featureDFSchema)
   }