From 118b3b3cff37e1614aa73207730e3507b9fa6847 Mon Sep 17 00:00:00 2001 From: pomadchin Date: Wed, 17 Apr 2024 13:37:35 +0000 Subject: [PATCH] deploy: 370adc433903863094b706de56154fa59947b605 --- Cats.html | 4 ++-- FeatureOverview.html | 4 ++-- Injection.html | 8 ++++---- Job.html | 4 ++-- TypedDatasetVsSparkDataset.html | 2 +- TypedEncoder.html | 2 +- TypedML.html | 20 ++++++++++---------- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Cats.html b/Cats.html index 103eb752..ea2181ee 100644 --- a/Cats.html +++ b/Cats.html @@ -166,7 +166,7 @@

count <- typedDs.count[Action]() } yield (sample, count) // result: Action[(Seq[(Int, String)], Long)] = Kleisli( -// cats.data.Kleisli$$Lambda$12732/0x0000000803795840@7e1db744 +// cats.data.Kleisli$$Lambda$12555/0x0000000803795840@3146ae37 // )

As with Job, note that nothing has been run yet. The effect has been properly suspended. To run our program, we must first supply the SparkSession to the ReaderT layer and then @@ -193,7 +193,7 @@

yield r // resultWithDescription: Action[(Seq[(Int, String)], Long)] = Kleisli( -// cats.data.Kleisli$$$Lambda$14206/0x0000000803d5a840@42e7ca37 +// cats.data.Kleisli$$$Lambda$14017/0x0000000803d3d040@164c8e05 // ) resultWithDescription.run(spark).unsafeRunSync() diff --git a/FeatureOverview.html b/FeatureOverview.html index 3f5fba1e..7313fa71 100644 --- a/FeatureOverview.html +++ b/FeatureOverview.html @@ -688,8 +688,8 @@

// +----+-------+--------+--------+ // |city|surface| price|bedrooms| // +----+-------+--------+--------+ -// |Lyon| 83|200000.0| 2| // |Lyon| 45|133000.0| 1| +// |Lyon| 83|200000.0| 2| // +----+-------+--------+--------+ // only showing top 2 rows // @@ -716,7 +716,7 @@

// priceModifier: (String, Double) => Double = <function2> val udf = aptTypedDs.makeUDF(priceModifier) -// udf: (frameless.TypedColumn[Apartment, String], frameless.TypedColumn[Apartment, Double]) => frameless.TypedColumn[Apartment, Double] = frameless.functions.Udf$$Lambda$15277/0x00000008041bc840@704c2976 +// udf: (frameless.TypedColumn[Apartment, String], frameless.TypedColumn[Apartment, Double]) => frameless.TypedColumn[Apartment, Double] = frameless.functions.Udf$$Lambda$15137/0x00000008041d1840@782725bb val aptds = aptTypedDs // For shorter expressions // aptds: TypedDataset[Apartment] = [city: string, surface: int ... 2 more fields] diff --git a/Injection.html b/Injection.html index 73724798..7004e6e0 100644 --- a/Injection.html +++ b/Injection.html @@ -144,7 +144,7 @@

// people: Seq[Person] = List( // Person( // 42, -// java.util.GregorianCalendar[time=1711202985788,areFieldsSet=true,areAllFieldsSet=true,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Etc/UTC",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=1,YEAR=2024,MONTH=2,WEEK_OF_YEAR=12,WEEK_OF_MONTH=4,DAY_OF_MONTH=23,DAY_OF_YEAR=83,DAY_OF_WEEK=7,DAY_OF_WEEK_IN_MONTH=4,AM_PM=1,HOUR=2,HOUR_OF_DAY=14,MINUTE=9,SECOND=45,MILLISECOND=788,ZONE_OFFSET=0,DST_OFFSET=0] +// java.util.GregorianCalendar[time=1713361029652,areFieldsSet=true,areAllFieldsSet=true,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Etc/UTC",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=1,YEAR=2024,MONTH=3,WEEK_OF_YEAR=16,WEEK_OF_MONTH=3,DAY_OF_MONTH=17,DAY_OF_YEAR=108,DAY_OF_WEEK=4,DAY_OF_WEEK_IN_MONTH=3,AM_PM=1,HOUR=1,HOUR_OF_DAY=13,MINUTE=37,SECOND=9,MILLISECOND=652,ZONE_OFFSET=0,DST_OFFSET=0] // ) // )

And an instance of a TypedDataset:

@@ -167,7 +167,7 @@

cal } } -// calendarToLongInjection: AnyRef with Injection[Calendar, Long] = repl.MdocSession$MdocApp0$$anon$1@70c61909 +// calendarToLongInjection: AnyRef with Injection[Calendar, Long] = repl.MdocSession$MdocApp0$$anon$1@5e89e865

We can be less verbose using the Injection.apply function:

import frameless._
 
@@ -180,7 +180,7 @@ 

cal.setTime(new java.util.Date(l)) cal }) -// calendarToLongInjection: Injection[Calendar, Long] = frameless.Injection$$anon$1@7d4fe061

+// calendarToLongInjection: Injection[Calendar, Long] = frameless.Injection$$anon$1@6a5c8e85

Now we can create our TypedDataset:

val personDS = TypedDataset.create(people)
 // personDS: TypedDataset[Person] = [age: int, birthday: bigint]
@@ -214,7 +214,7 @@

case 2 => Female case 3 => Other }) -// genderToInt: Injection[Gender, Int] = frameless.Injection$$anon$1@6647fa98 +// genderToInt: Injection[Gender, Int] = frameless.Injection$$anon$1@691f5577

And now we can create our TypedDataset:

val personDS = TypedDataset.create(people)
 // personDS: TypedDataset[Person] = [age: int, gender: int]
diff --git a/Job.html b/Job.html index a3b04ca5..0e97655f 100644 --- a/Job.html +++ b/Job.html @@ -156,7 +156,7 @@

Job[A]

count <- ds.count() sample <- ds.take((count/5).toInt) } yield sample -// countAndTakeJob: frameless.Job[Seq[Int]] = frameless.Job$$anon$3@4920885a +// countAndTakeJob: frameless.Job[Seq[Int]] = frameless.Job$$anon$3@24c9a2f6 countAndTakeJob.run() // res1: Seq[Int] = WrappedArray(1, 2, 3, 4) @@ -167,7 +167,7 @@

Job[A]

def computeMinOfSample(sample: Job[Seq[Int]]): Job[Int] = sample.map(_.min) val finalJob = computeMinOfSample(countAndTakeJob) -// finalJob: Job[Int] = frameless.Job$$anon$2@52d2f575 +// finalJob: Job[Int] = frameless.Job$$anon$2@347e2262

Now we can execute this new job by specifying a group-id and a description. This allows the programmer to see this information on the Spark UI and help track, say, performance issues.

diff --git a/TypedDatasetVsSparkDataset.html b/TypedDatasetVsSparkDataset.html index fa947c75..9c9dfabd 100644 --- a/TypedDatasetVsSparkDataset.html +++ b/TypedDatasetVsSparkDataset.html @@ -160,8 +160,8 @@

Comparing T // | i| j| // +---+---+ // | 1| Q| -// | 10| W| // |100| E| +// | 10| W| // +---+---+ //

The value ds holds the content of the initialDs read from a parquet file. diff --git a/TypedEncoder.html b/TypedEncoder.html index cf75ed7f..3f027ab8 100644 --- a/TypedEncoder.html +++ b/TypedEncoder.html @@ -212,7 +212,7 @@

Typed Encoders in Frameless// ds: TypedDataset[Foo] = [i: int, b: struct<d: double, s: string>] ds.collect() -// res3: frameless.Job[Seq[Foo]] = frameless.Job$$anon$4@759e2669 +// res3: frameless.Job[Seq[Foo]] = frameless.Job$$anon$4@7f79b512

But any non-encodable in the case class hierarchy will be detected at compile time:

case class BarDate(d: Double, s: String, t: java.util.Calendar)
 case class FooDate(i: Int, b: BarDate)
diff --git a/TypedML.html b/TypedML.html index fbcd9cfa..4b1104de 100644 --- a/TypedML.html +++ b/TypedML.html @@ -176,7 +176,7 @@

case class Features(squareFeet: Double, hasGarden: Boolean) val assembler = TypedVectorAssembler[Features] -// assembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@152c380c +// assembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@6b0ef032 case class HouseDataWithFeatures(squareFeet: Double, hasGarden: Boolean, price: Double, features: Vector) val trainingDataWithFeatures = assembler.transform(trainingData).as[HouseDataWithFeatures] @@ -212,10 +212,10 @@

case class RFInputs(price: Double, features: Vector) val rf = TypedRandomForestRegressor[RFInputs] -// rf: TypedRandomForestRegressor[RFInputs] = frameless.ml.regression.TypedRandomForestRegressor@421d3e99 +// rf: TypedRandomForestRegressor[RFInputs] = frameless.ml.regression.TypedRandomForestRegressor@6aa1198e val model = rf.fit(trainingDataWithFeatures).run() -// model: AppendTransformer[RFInputs, TypedRandomForestRegressor.Outputs, org.apache.spark.ml.regression.RandomForestRegressionModel] = frameless.ml.TypedEstimator$$anon$1@797d26a4 +// model: AppendTransformer[RFInputs, TypedRandomForestRegressor.Outputs, org.apache.spark.ml.regression.RandomForestRegressionModel] = frameless.ml.TypedEstimator$$anon$1@6218df02

TypedRandomForestRegressor[RFInputs] compiles only if RFInputs contains only one field of type Double (the label) and one field of type Vector (the features):

case class WrongRFInputs(labelOfWrongType: String, features: Vector)
@@ -281,7 +281,7 @@

case class Features(price: Double, squareFeet: Double) val vectorAssembler = TypedVectorAssembler[Features] -// vectorAssembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@4ae476de +// vectorAssembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@13f58847 case class HouseDataWithFeatures(squareFeet: Double, city: String, price: Double, features: Vector) val dataWithFeatures = vectorAssembler.transform(trainingData).as[HouseDataWithFeatures] @@ -289,11 +289,11 @@

case class StringIndexerInput(city: String) val indexer = TypedStringIndexer[StringIndexerInput] -// indexer: TypedStringIndexer[StringIndexerInput] = frameless.ml.feature.TypedStringIndexer@5324839c +// indexer: TypedStringIndexer[StringIndexerInput] = frameless.ml.feature.TypedStringIndexer@864cdcc indexer.estimator.setHandleInvalid("keep") -// res12: org.apache.spark.ml.feature.StringIndexer = strIdx_a4d1e92cb0ff +// res12: org.apache.spark.ml.feature.StringIndexer = strIdx_573fe836f1d8 val indexerModel = indexer.fit(dataWithFeatures).run() -// indexerModel: AppendTransformer[StringIndexerInput, TypedStringIndexer.Outputs, org.apache.spark.ml.feature.StringIndexerModel] = frameless.ml.TypedEstimator$$anon$1@46271c4 +// indexerModel: AppendTransformer[StringIndexerInput, TypedStringIndexer.Outputs, org.apache.spark.ml.feature.StringIndexerModel] = frameless.ml.TypedEstimator$$anon$1@20c7e37d case class HouseDataWithFeaturesAndIndex( squareFeet: Double, @@ -307,10 +307,10 @@

case class RFInputs(cityIndexed: Double, features: Vector) val rf = TypedRandomForestClassifier[RFInputs] -// rf: TypedRandomForestClassifier[RFInputs] = frameless.ml.classification.TypedRandomForestClassifier@37fd86e9 +// rf: TypedRandomForestClassifier[RFInputs] = frameless.ml.classification.TypedRandomForestClassifier@497ce755 val model = rf.fit(indexedData).run() -// model: AppendTransformer[RFInputs, TypedRandomForestClassifier.Outputs, org.apache.spark.ml.classification.RandomForestClassificationModel] = frameless.ml.TypedEstimator$$anon$1@5e46544e +// model: AppendTransformer[RFInputs, TypedRandomForestClassifier.Outputs, org.apache.spark.ml.classification.RandomForestClassificationModel] = frameless.ml.TypedEstimator$$anon$1@7674f98d

Prediction

We now want to predict city for testData using the previously trained model. Like the Spark ML API, @@ -342,7 +342,7 @@

case class IndexToStringInput(predictedCityIndexed: Double) val indexToString = TypedIndexToString[IndexToStringInput](indexerModel.transformer.labels) -// indexToString: TypedIndexToString[IndexToStringInput] = frameless.ml.feature.TypedIndexToString@29891479 +// indexToString: TypedIndexToString[IndexToStringInput] = frameless.ml.feature.TypedIndexToString@58d05022 case class HouseCityPrediction( features: Vector,