From 03b79393e71910a33a39864e563fcbeb2de56658 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Sun, 19 Apr 2020 22:31:05 -0700
Subject: [PATCH 01/35] Adding section for UDF serialization

---
 docs/broadcast-guide.md |  92 +++++++++++++++
 docs/udf-guide.md       | 172 ++++++++++++++++++++++++++++
 2 files changed, 264 insertions(+)
 create mode 100644 docs/broadcast-guide.md
 create mode 100644 docs/udf-guide.md

diff --git a/docs/broadcast-guide.md b/docs/broadcast-guide.md
new file mode 100644
index 000000000..4286c569e
--- /dev/null
+++ b/docs/broadcast-guide.md
@@ -0,0 +1,92 @@
+# Guide to Using Broadcast Variables
+
+This is a guide to using broadcast variables in .NET for Apache Spark.
+
+## What are Broadcast Variables
+
+[Broadcast variables in Apache Spark](https://spark.apache.org/docs/2.2.0/rdd-programming-guide.html#broadcast-variables) are a mechanism for sharing read-only variables across executors. They allow the programmer to keep a read-only variable cached on each machine rather than shipping a copy of it with tasks. They can be used, for example, to give every node a copy of a large input dataset in an efficient manner.
+
+### How to use broadcast variables in .NET for Apache Spark
+
+Broadcast variables are created from a variable `v` by calling `SparkContext.Broadcast(v)`. The broadcast variable is a wrapper around `v`, and its value can be accessed by calling the `Value()` method on it.
+
+Example:
+
+```csharp
+string v = "Variable to be broadcasted";
+Broadcast<string> bv = SparkContext.Broadcast(v);
+
+// Using the broadcast variable in a UDF:
+Func<Column, Column> udf = Udf<string, string>(
+    str => $"{str}: {bv.Value()}");
+```
+
+The type of the broadcast variable is captured using generics in C#, as seen in the example above.
+
+### Deleting broadcast variables
+
+A broadcast variable can be deleted from all executors by calling the `Destroy()` method on it.
+
+```csharp
+// Destroying the broadcast variable bv:
+bv.Destroy();
+```
+
+> Note: `Destroy` deletes all data and metadata related to the broadcast variable. Use it with caution: once a broadcast variable has been destroyed, it cannot be used again.
+
+#### Caveat of using Destroy
+
+One important thing to keep in mind while using broadcast variables in UDFs is to limit the scope of each variable to the UDF that references it. The [guide to using UDFs](udf-guide.md) describes this phenomenon in detail, and it is especially crucial when calling `Destroy` on a broadcast variable. If a broadcast variable that has been destroyed is visible to or accessible from other UDFs, it gets picked up for serialization by all of those UDFs, even if they never reference it. This throws an error, because .NET for Apache Spark cannot serialize the destroyed broadcast variable.
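+
+The examples below reference a `SparkSession` and a `DataFrame` `df` that are not defined here. A minimal sketch of the assumed setup follows; the names `spark`, `sc`, and `df` are illustrative, and any DataFrame with a string column `_1` works:
+
+```csharp
+using Microsoft.Spark;
+using Microsoft.Spark.Sql;
+using static Microsoft.Spark.Sql.Functions;
+
+// Assumed setup for the snippets that follow. The `SparkContext.Broadcast(v)`
+// calls below are shorthand for calling Broadcast on this instance.
+SparkSession spark = SparkSession.Builder().GetOrCreate();
+SparkContext sc = spark.SparkContext;
+DataFrame df = spark.Sql("SELECT 'some text' AS _1");
+```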
+
+Example to demonstrate:
+
+```csharp
+string v = "Variable to be broadcasted";
+Broadcast<string> bv = SparkContext.Broadcast(v);
+
+// Using the broadcast variable in a UDF:
+Func<Column, Column> udf1 = Udf<string, string>(
+    str => $"{str}: {bv.Value()}");
+
+// Destroying bv
+bv.Destroy();
+
+// Calling udf1 after destroying bv throws the following expected exception:
+// org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed
+df.Select(udf1(df["_1"])).Show();
+
+// Different UDF udf2 that is not referencing bv
+Func<Column, Column> udf2 = Udf<string, string>(
+    str => $"{str}: not referencing broadcast variable");
+
+// Calling udf2 throws the following (unexpected) exception:
+// [Error] [JvmBridge] org.apache.spark.SparkException: Task not serializable
+df.Select(udf2(df["_1"])).Show();
+```
+
+The recommended way to implement the desired behavior:
+
+```csharp
+string v = "Variable to be broadcasted";
+
+// Restricting the visibility of bv to only the UDF referencing it
+{
+    Broadcast<string> bv = SparkContext.Broadcast(v);
+
+    // Using the broadcast variable in a UDF:
+    Func<Column, Column> udf1 = Udf<string, string>(
+        str => $"{str}: {bv.Value()}");
+
+    // Destroying bv
+    bv.Destroy();
+}
+
+// Different UDF udf2 that is not referencing bv
+Func<Column, Column> udf2 = Udf<string, string>(
+    str => $"{str}: not referencing broadcast variable");
+
+// Calling udf2 works fine as expected
+df.Select(udf2(df["_1"])).Show();
+```
+
+This ensures that destroying `bv` doesn't affect calling `udf2` through unexpected serialization behavior.
+
+Broadcast variables are very useful for transmitting read-only data to all executors: the data is sent only once, which gives significant performance benefits compared with local variables that are shipped to the executors with each task. Please refer to the [official documentation](https://spark.apache.org/docs/2.2.0/rdd-programming-guide.html#broadcast-variables) for a deeper understanding of broadcast variables and why they are used.
\ No newline at end of file
diff --git a/docs/udf-guide.md b/docs/udf-guide.md
new file mode 100644
index 000000000..bb308815d
--- /dev/null
+++ b/docs/udf-guide.md
@@ -0,0 +1,172 @@
+# Guide to User-Defined Functions (UDFs)
+
+This is a guide to using UDFs in .NET for Apache Spark.
+
+## What are UDFs
+
+[User-Defined Functions (UDFs)](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/expressions/UserDefinedFunction.html) are a Spark feature that lets developers extend the system's built-in functionality with custom functions. A UDF transforms the values of a single row within a table, producing one output value per row based on the logic defined in the UDF.
+
+Let's take the following as an example of a UDF definition:
+
+```csharp
+string s1 = "hello";
+Func<Column, Column> udf = Udf<string, string>(
+    str => $"{s1} {str}");
+```
+
+The UDF defined above takes a `string` as input (in the form of a [Column](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/Column.cs#L14) of a [DataFrame](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/DataFrame.cs#L24)) and returns a `string` with `hello` prepended to the input.
+
+For a sample DataFrame, let's take the following DataFrame `df`:
+
+```text
++-------+
+|   name|
++-------+
+|Michael|
+|   Andy|
+| Justin|
++-------+
+```
+
+Now let's apply the above defined `udf` to the DataFrame `df`:
+
+```csharp
+DataFrame udfResult = df.Select(udf(df["name"]));
+```
+
+This returns the following DataFrame as `udfResult`:
+
+```text
++-------------+
+|         name|
++-------------+
+|hello Michael|
+|   hello Andy|
+| hello Justin|
++-------------+
+```
+
+To get a better understanding of how to implement UDFs, please take a look at the [UDF helper functions](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/Functions.cs#L3616) and some [test examples](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs#L49).
+
+## UDF serialization
+
+Since UDFs are functions that need to be executed on the workers, they have to be serialized and sent to the workers as part of the payload from the driver. This involves serializing the [delegate](https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/delegates/), which is a reference to the method, along with its [target](https://docs.microsoft.com/en-us/dotnet/api/system.delegate.target?view=netframework-4.8), which is the class instance on which the delegate invokes the instance method. Please take a look at this [code](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Utils/CommandSerDe.cs#L149) to get a better understanding of how UDF serialization is done.
+
+## Good to know while implementing UDFs
+
+One behavior to be aware of while implementing UDFs in .NET for Apache Spark is how the target of a UDF gets serialized. .NET for Apache Spark uses .NET Core, which does not support serializing delegates, so the serialization is instead done by using reflection on the target where the delegate is defined. When multiple delegates are defined in a common scope, they have a shared closure, and that closure becomes the target that reflection serializes. Let's take an example to illustrate what that means.
+
+The following code snippet defines two string variables that are referenced in two function delegates, which simply return the respective strings:
+
+```csharp
+using System;
+
+public class C {
+    public void M() {
+        string s1 = "s1";
+        string s2 = "s2";
+        Func<string, string> a = str => s1;
+        Func<string, string> b = str => s2;
+    }
+}
+```
+
+The compiler translates the above C# code into the following (decompiled with [sharplab.io](https://sharplab.io)):
+
+```csharp
+public class C
+{
+    [CompilerGenerated]
+    private sealed class <>c__DisplayClass0_0
+    {
+        public string s1;
+
+        public string s2;
+
+        internal string <M>b__0(string str)
+        {
+            return s1;
+        }
+
+        internal string <M>b__1(string str)
+        {
+            return s2;
+        }
+    }
+
+    public void M()
+    {
+        <>c__DisplayClass0_0 <>c__DisplayClass0_ = new <>c__DisplayClass0_0();
+        <>c__DisplayClass0_.s1 = "s1";
+        <>c__DisplayClass0_.s2 = "s2";
+        Func<string, string> func = new Func<string, string>(<>c__DisplayClass0_.<M>b__0);
+        Func<string, string> func2 = new Func<string, string>(<>c__DisplayClass0_.<M>b__1);
+    }
+}
+```
+
+As the decompiled code shows, both `func` and `func2` share the same closure `<>c__DisplayClass0_0`, which is the target that gets serialized when serializing the delegates `func` and `func2`. Hence, even though `Func<string, string> a` only references `s1`, `s2` also gets serialized when sending the bytes over to the workers.
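+
+A quick way to observe this shared closure at runtime is to compare the delegates' `Target` properties. Below is a minimal, self-contained sketch (the `ClosureDemo` class name is illustrative, not part of any library):
+
+```csharp
+using System;
+
+public class ClosureDemo
+{
+    public static void Main()
+    {
+        string s1 = "s1";
+        string s2 = "s2";
+        Func<string, string> a = str => s1;
+        Func<string, string> b = str => s2;
+
+        // Both lambdas capture locals from the same scope, so the compiler
+        // hands them one shared closure instance as their Target.
+        Console.WriteLine(ReferenceEquals(a.Target, b.Target)); // True
+        Console.WriteLine(a.Target.GetType().Name);             // e.g. <>c__DisplayClass0_0
+    }
+}
+```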
+
+This can lead to unexpected behavior at runtime (as in the case of using [broadcast variables](broadcast-guide.md)), which is why we recommend restricting the visibility of the variables used in a function to that function's scope.
+
+Taking the above example, the recommended user code to implement the desired behavior of the previous snippet is:
+
+```csharp
+using System;
+
+public class C {
+    public void M() {
+        {
+            string s1 = "s1";
+            Func<string, string> a = str => s1;
+        }
+        {
+            string s2 = "s2";
+            Func<string, string> b = str => s2;
+        }
+    }
+}
+```
+
+The compiler translates the above C# code into the following (decompiled with [sharplab.io](https://sharplab.io)):
+
+```csharp
+public class C
+{
+    [CompilerGenerated]
+    private sealed class <>c__DisplayClass0_0
+    {
+        public string s1;
+
+        internal string <M>b__0(string str)
+        {
+            return s1;
+        }
+    }
+
+    [CompilerGenerated]
+    private sealed class <>c__DisplayClass0_1
+    {
+        public string s2;
+
+        internal string <M>b__1(string str)
+        {
+            return s2;
+        }
+    }
+
+    public void M()
+    {
+        <>c__DisplayClass0_0 <>c__DisplayClass0_ = new <>c__DisplayClass0_0();
+        <>c__DisplayClass0_.s1 = "s1";
+        Func<string, string> func = new Func<string, string>(<>c__DisplayClass0_.<M>b__0);
+        <>c__DisplayClass0_1 <>c__DisplayClass0_2 = new <>c__DisplayClass0_1();
+        <>c__DisplayClass0_2.s2 = "s2";
+        Func<string, string> func2 = new Func<string, string>(<>c__DisplayClass0_2.<M>b__1);
+    }
+}
+```
+
+Here we see that `func` and `func2` no longer share a closure; they have their own separate closures, `<>c__DisplayClass0_0` and `<>c__DisplayClass0_1` respectively. When a closure is used as the target for serialization, nothing other than the variables referenced by that delegate gets serialized.
+
+The behavior described above is important to keep in mind while implementing multiple UDFs in a common scope.
+To learn more about UDFs in general, please review the following articles that explain UDFs and how to use them: [UDFs in Databricks (Scala)](https://docs.databricks.com/spark/latest/spark-sql/udf-scala.html) and [Spark UDFs and some gotchas](https://medium.com/@achilleus/spark-udfs-we-can-use-them-but-should-we-use-them-2c5a561fde6d).
\ No newline at end of file

From 4ef693dbf7616b738a6ae70d1e9dc8c12dd8e5d3 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Sun, 19 Apr 2020 22:32:56 -0700
Subject: [PATCH 02/35] removing guides from master

---
 docs/broadcast-guide.md |  92 ---------------------
 docs/udf-guide.md       | 172 ----------------------------------
 2 files changed, 264 deletions(-)
 delete mode 100644 docs/broadcast-guide.md
 delete mode 100644 docs/udf-guide.md

diff --git a/docs/broadcast-guide.md b/docs/broadcast-guide.md
deleted file mode 100644
index 4286c569e..000000000
--- a/docs/broadcast-guide.md
+++ /dev/null
@@ -1,92 +0,0 @@
diff --git a/docs/udf-guide.md b/docs/udf-guide.md
deleted file mode 100644
index bb308815d..000000000
--- a/docs/udf-guide.md
+++ /dev/null
@@ -1,172 +0,0 @@

From 6bab99604db5cc8b8528b54216085afb96cbaff7 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 27 Jul 2020 21:10:51 +0100
Subject: [PATCH 03/35] CountVectorizer

---
 .../ML/Feature/CountVectorizerModelTests.cs |  73 +++++++
 .../ML/Feature/CountVectorizerTests.cs      |  70 +++++++
 .../ML/Feature/CountVectorizer.cs           | 195 ++++++++++++++++++
 .../ML/Feature/CountVectorizerModel.cs      | 170 +++++++++++++++
 4 files changed, 508 insertions(+)
 create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs
 create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs
 create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs
 create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs
new file mode 100644
index 000000000..3c3132dd9
--- /dev/null
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs
@@ -0,0 +1,73 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Microsoft.Spark.ML.Feature;
+using Microsoft.Spark.Sql;
+using Microsoft.Spark.UnitTest.TestUtils;
+using Xunit;
+
+namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
+{
+    [Collection("Spark E2E Tests")]
+    public class CountVectorizerModelTests
+    {
+        private readonly SparkSession _spark;
+
+        public CountVectorizerModelTests(SparkFixture fixture)
+        {
+            _spark = fixture.Spark;
+        }
+
+        [Fact]
+        public void Test_CountVectorizerModel()
+        {
+            DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " +
+                "'TOKENIZE') as input from range(100)");
+
+            const string inputColumn = "input";
+            const string outputColumn = "output";
+            const double minTf = 10.0;
+            const bool binary = false;
+
+            List<string> vocabulary = new List<string>()
+            {
+                "hello",
+                "I",
+                "AM",
+                "TO",
+                "TOKENIZE"
+            };
+
+            var countVectorizerModel = new CountVectorizerModel(vocabulary);
+
+            Assert.IsType<CountVectorizerModel>(new CountVectorizerModel("my-uid", vocabulary));
+
+            countVectorizerModel = countVectorizerModel
+                .SetInputCol(inputColumn)
+                .SetOutputCol(outputColumn)
+                .SetMinTF(minTf)
+                .SetBinary(binary);
+
+            Assert.Equal(inputColumn, countVectorizerModel.GetInputCol());
+            Assert.Equal(outputColumn, countVectorizerModel.GetOutputCol());
+            Assert.Equal(minTf, countVectorizerModel.GetMinTF());
+            Assert.Equal(binary, countVectorizerModel.GetBinary());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                string savePath = Path.Join(tempDirectory.Path, "countVectorizerModel");
+                countVectorizerModel.Save(savePath);
+
+                CountVectorizerModel loadedModel = CountVectorizerModel.Load(savePath);
+                Assert.Equal(countVectorizerModel.Uid(), loadedModel.Uid());
+            }
+
+            Assert.IsType<int>(countVectorizerModel.GetVocabSize());
+            Assert.NotEmpty(countVectorizerModel.ExplainParams());
+            Assert.NotEmpty(countVectorizerModel.ToString());
+        }
+    }
+}
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs
new file mode 100644
index 000000000..d54bfe376
--- /dev/null
+++ 
b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs @@ -0,0 +1,70 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class CountVectorizerTests + { + private readonly SparkSession _spark; + + public CountVectorizerTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void Test_CountVectorizer() + { + DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " + + "'TOKENIZE') as input from range(100)"); + + const string inputColumn = "input"; + const string outputColumn = "output"; + const double minDf = 1; + const double maxDf = 100; + const double minTf = 10; + const int vocabSize = 10000; + const bool binary = false; + + var countVectorizer = new CountVectorizer(); + + countVectorizer + .SetInputCol(inputColumn) + .SetOutputCol(outputColumn) + .SetMinDF(minDf) + .SetMaxDF(maxDf) + .SetMinTF(minTf) + .SetVocabSize(vocabSize); + + Assert.IsType(countVectorizer.Fit(input)); + Assert.Equal(inputColumn, countVectorizer.GetInputCol()); + Assert.Equal(outputColumn, countVectorizer.GetOutputCol()); + Assert.Equal(minDf, countVectorizer.GetMinDF()); + Assert.Equal(maxDf, countVectorizer.GetMaxDF()); + Assert.Equal(minTf, countVectorizer.GetMinTF()); + Assert.Equal(vocabSize, countVectorizer.GetVocabSize()); + Assert.Equal(binary, countVectorizer.GetBinary()); + + using (var tempDirectory = new TemporaryDirectory()) + { + string savePath = Path.Join(tempDirectory.Path, "countVectorizer"); + countVectorizer.Save(savePath); + + CountVectorizer loadedVectorizer = CountVectorizer.Load(savePath); + Assert.Equal(countVectorizer.Uid(), loadedVectorizer.Uid()); + } + + Assert.NotEmpty(countVectorizer.ExplainParams()); + Assert.NotEmpty(countVectorizer.ToString()); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs new file mode 100644 index 000000000..41e0dbdd0 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs @@ -0,0 +1,195 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + public class CountVectorizer : FeatureBase, IJvmObjectReferenceProvider + { + private static readonly string s_countVectorizerClassName = + "org.apache.spark.ml.feature.CountVectorizer"; + + /// + /// Create a without any parameters + /// + public CountVectorizer() : base(s_countVectorizerClassName) + { + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + public CountVectorizer(string uid) : base(s_countVectorizerClassName, uid) + { + } + + internal CountVectorizer(JvmObjectReference jvmObject) : base(jvmObject) + { + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// Fits a model to the input data. 
+ /// The to fit the model to. + /// + public CountVectorizerModel Fit(DataFrame dataFrame) => + new CountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("fit", dataFrame)); + + /// + /// Loads the that was previously saved using Save + /// + /// + /// The path the previous was saved to + /// + /// New object + public static CountVectorizer Load(string path) => + WrapAsType((JvmObjectReference) + SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_countVectorizerClassName,"load", path)); + + /// + /// Gets the binary toggle to control the output vector values. If True, all nonzero counts + /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts. Default: false + /// + /// boolean + public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); + + /// + /// Sets the binary toggle to control the output vector values. If True, all nonzero counts + /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts. Default: false + /// + /// Turn the binary toggle on or off + /// with the new binary toggle value set + public CountVectorizer SetBinary(bool value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); + + private static CountVectorizer WrapAsCountVectorizer(object obj) => + new CountVectorizer((JvmObjectReference)obj); + + /// + /// Gets the column that the should read from and convert + /// into buckets. This would have been set by SetInputCol + /// + /// string, the input column + public string GetInputCol() => _jvmObject.Invoke("getInputCol") as string; + + /// + /// Sets the column that the should read from. + /// + /// The name of the column to as the source. + /// with the input column set + public CountVectorizer SetInputCol(string value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setInputCol", value)); + + /// + /// The will create a new column in the DataFrame, this is + /// the name of the new column. + /// + /// The name of the output column. + public string GetOutputCol() => _jvmObject.Invoke("getOutputCol") as string; + + /// + /// The will create a new column in the DataFrame, this + /// is the name of the new column. + /// + /// The name of the output column which will be created. + /// New with the output column set + public CountVectorizer SetOutputCol(string value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setOutputCol", value)); + + /// + /// Gets the maximum number of different documents a term could appear in to be included in + /// the vocabulary. A term that appears more than the threshold will be ignored. If this is + /// an integer greater than or equal to 1, this specifies the maximum number of documents + /// the term could appear in; if this is a double in [0,1), then this specifies the maximum + /// fraction of documents the term could appear in. + /// + /// The maximum document term frequency + public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); + + /// + /// Sets the maximum number of different documents a term could appear in to be included in + /// the vocabulary. A term that appears more than the threshold will be ignored. 
If this is + /// an integer greater than or equal to 1, this specifies the maximum number of documents + /// the term could appear in; if this is a double in [0,1), then this specifies the maximum + /// fraction of documents the term could appear in. + /// + /// The maximum document term frequency + /// New with the max df value set + public CountVectorizer SetMaxDF(double value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMaxDF", value)); + + /// + /// Gets the minimum number of different documents a term must appear in to be included in + /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the + /// number of documents the term must appear in; if this is a double in [0,1), then this + /// specifies the fraction of documents. + /// + /// The minimum document term frequency + public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF"); + + /// + /// Sets the minimum number of different documents a term must appear in to be included in + /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the + /// number of documents the term must appear in; if this is a double in [0,1), then this + /// specifies the fraction of documents. + /// + /// The minimum document term frequency + /// New with the min df value set + public CountVectorizer SetMinDF(double value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMinDF", value)); + + /// + /// Filter to ignore rare words in a document. For each document, terms with + /// frequency/count less than the given threshold are ignored. If this is an integer + /// greater than or equal to 1, then this specifies a count (of times the term must appear + /// in the document); if this is a double in [0,1), then this specifies a fraction (out of + /// the document's token count). + /// + /// Note that the parameter is only used in transform of CountVectorizerModel and does not + /// affect fitting. + /// + /// Minimum term frequency + public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF"); + + /// + /// Filter to ignore rare words in a document. For each document, terms with + /// frequency/count less than the given threshold are ignored. If this is an integer + /// greater than or equal to 1, then this specifies a count (of times the term must appear + /// in the document); if this is a double in [0,1), then this specifies a fraction (out of + /// the document's token count). + /// + /// Note that the parameter is only used in transform of CountVectorizerModel and does not + /// affect fitting. + /// + /// Minimum term frequency + /// New with the min term frequency set + public CountVectorizer SetMinTF(double value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMinTF", value)); + + /// + /// Gets the max size of the vocabulary. CountVectorizer will build a vocabulary that only + /// considers the top vocabSize terms ordered by term frequency across the corpus. + /// + /// The max size of the vocabulary + public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); + + /// + /// Sets the max size of the vocabulary. will build a + /// vocabulary that only considers the top vocabSize terms ordered by term frequency across + /// the corpus. 
+ /// + /// The max vocabulary size + /// with the max vocab value set + public CountVectorizer SetVocabSize(int value) => + WrapAsCountVectorizer(_jvmObject.Invoke("setVocabSize", value)); + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs new file mode 100644 index 000000000..8a6e427df --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs @@ -0,0 +1,170 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; + +namespace Microsoft.Spark.ML.Feature +{ + public class CountVectorizerModel : FeatureBase + , IJvmObjectReferenceProvider + { + private static readonly string s_countVectorizerModelClassName = + "org.apache.spark.ml.feature.CountVectorizerModel"; + + /// + /// Create a without any parameters + /// + /// The vocabulary to use + public CountVectorizerModel(List vocabulary) : + this(SparkEnvironment.JvmBridge.CallConstructor( + s_countVectorizerModelClassName, vocabulary)) + { + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + /// The vocabulary to use + public CountVectorizerModel(string uid, List vocabulary) : + this(SparkEnvironment.JvmBridge.CallConstructor( + s_countVectorizerModelClassName, uid, vocabulary)) + { + } + + internal CountVectorizerModel(JvmObjectReference jvmObject) : base(jvmObject) + { + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Loads the that was previously saved using Save + /// + /// + /// The path the previous was saved to + /// + /// New object + public static CountVectorizerModel Load(string path) => + WrapAsType((JvmObjectReference) + SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_countVectorizerModelClassName,"load", path)); + + /// + /// Gets the binary toggle to control the output vector values. If True, all nonzero counts + /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts. Default: false + /// + /// boolean + public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); + + /// + /// Sets the binary toggle to control the output vector values. If True, all nonzero counts + /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts. Default: false + /// + /// Turn the binary toggle on or off + /// + /// with the new binary toggle value set + /// + public CountVectorizerModel SetBinary(bool value) => + WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); + + private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) => + new CountVectorizerModel((JvmObjectReference)obj); + + /// + /// Gets the column that the should read from and + /// convert into buckets. This would have been set by SetInputCol + /// + /// string, the input column + public string GetInputCol() => _jvmObject.Invoke("getInputCol") as string; + + /// + /// Sets the column that the should read from. + /// + /// The name of the column to as the source. 
+ /// with the input column set + public CountVectorizerModel SetInputCol(string value) => + WrapAsCountVectorizerModel( + (JvmObjectReference)_jvmObject.Invoke("setInputCol", value)); + + /// + /// The will create a new column in the DataFrame, this + /// is the name of the new column. + /// + /// The name of the output column. + public string GetOutputCol() => _jvmObject.Invoke("getOutputCol") as string; + + /// + /// The will create a new column in the DataFrame, + /// this is the name of the new column. + /// + /// The name of the output column which will be created. + /// New with the output column set + public CountVectorizerModel SetOutputCol(string value) => + WrapAsCountVectorizerModel( + (JvmObjectReference)_jvmObject.Invoke("setOutputCol", value)); + + /// + /// Gets the maximum number of different documents a term could appear in to be included in + /// the vocabulary. A term that appears more than the threshold will be ignored. If this is + /// an integer greater than or equal to 1, this specifies the maximum number of documents + /// the term could appear in; if this is a double in [0,1), then this specifies the maximum + /// fraction of documents the term could appear in. + /// + /// The maximum document term frequency + public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); + + /// + /// Gets the minimum number of different documents a term must appear in to be included in + /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the + /// number of documents the term must appear in; if this is a double in [0,1), then this + /// specifies the fraction of documents. + /// + /// The minimum document term frequency + public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF"); + + /// + /// Filter to ignore rare words in a document. For each document, terms with + /// frequency/count less than the given threshold are ignored. If this is an integer + /// greater than or equal to 1, then this specifies a count (of times the term must appear + /// in the document); if this is a double in [0,1), then this specifies a fraction (out of + /// the document's token count). + /// + /// Note that the parameter is only used in transform of CountVectorizerModel and does not + /// affect fitting. + /// + /// Minimum term frequency + public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF"); + + /// + /// Filter to ignore rare words in a document. For each document, terms with + /// frequency/count less than the given threshold are ignored. If this is an integer + /// greater than or equal to 1, then this specifies a count (of times the term must appear + /// in the document); if this is a double in [0,1), then this specifies a fraction (out of + /// the document's token count). + /// + /// Note that the parameter is only used in transform of CountVectorizerModel and does not + /// affect fitting. + /// + /// Minimum term frequency + /// + /// New with the min term frequency set + /// + public CountVectorizerModel SetMinTF(double value) => + WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setMinTF", value)); + + /// + /// Gets the max size of the vocabulary. will build a + /// vocabulary that only considers the top vocabSize terms ordered by term frequency across + /// the corpus. 
+ /// + /// The max size of the vocabulary + public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); + } +} From e2a566b1f4b29775be9b57616a258802e294f304 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 27 Jul 2020 21:24:35 +0100 Subject: [PATCH 04/35] moving private methods to bottom --- src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs | 6 +++--- .../Microsoft.Spark/ML/Feature/CountVectorizerModel.cs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs index 41e0dbdd0..cf68f7c4a 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs @@ -71,9 +71,6 @@ public static CountVectorizer Load(string path) => public CountVectorizer SetBinary(bool value) => WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); - private static CountVectorizer WrapAsCountVectorizer(object obj) => - new CountVectorizer((JvmObjectReference)obj); - /// /// Gets the column that the should read from and convert /// into buckets. This would have been set by SetInputCol @@ -191,5 +188,8 @@ public CountVectorizer SetMinTF(double value) => /// with the max vocab value set public CountVectorizer SetVocabSize(int value) => WrapAsCountVectorizer(_jvmObject.Invoke("setVocabSize", value)); + + private static CountVectorizer WrapAsCountVectorizer(object obj) => + new CountVectorizer((JvmObjectReference)obj); } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs index 8a6e427df..8e225a179 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs @@ -74,9 +74,6 @@ public static CountVectorizerModel Load(string path) => public CountVectorizerModel SetBinary(bool value) => WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); - private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) => - new CountVectorizerModel((JvmObjectReference)obj); - /// /// Gets the column that the should read from and /// convert into buckets. 
This would have been set by SetInputCol @@ -166,5 +163,8 @@ public CountVectorizerModel SetMinTF(double value) => /// /// The max size of the vocabulary public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); + + private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) => + new CountVectorizerModel((JvmObjectReference)obj); } } From 5f682a601ec783f1609e6fd6e32c4d83ff1491d1 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 28 Jul 2020 20:47:31 +0100 Subject: [PATCH 05/35] changing wrap method --- src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs | 2 +- src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs index cf68f7c4a..b3fa0ef8a 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs @@ -49,7 +49,7 @@ public CountVectorizerModel Fit(DataFrame dataFrame) => /// /// New object public static CountVectorizer Load(string path) => - WrapAsType((JvmObjectReference) + WrapAsCountVectorizer((JvmObjectReference) SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_countVectorizerClassName,"load", path)); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs index 8e225a179..52bbd72c3 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs @@ -50,7 +50,7 @@ internal CountVectorizerModel(JvmObjectReference jvmObject) : base(jvmObject) /// /// New object public static CountVectorizerModel Load(string path) => - WrapAsType((JvmObjectReference) + WrapAsCountVectorizerModel((JvmObjectReference) SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_countVectorizerModelClassName,"load", path)); From 31371db73b4faa653c07fdb8082e7aed02c0a031 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 31 Jul 2020 18:45:46 +0100 Subject: [PATCH 06/35] setting min version required --- .../IpcTests/ML/Feature/CountVectorizerTests.cs | 14 ++++++++++---- .../Microsoft.Spark/ML/Feature/CountVectorizer.cs | 2 ++ .../Microsoft.Spark/ML/Feature/FeatureBase.cs | 3 ++- src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 5 +---- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs index d54bfe376..95b9bc504 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs @@ -4,6 +4,7 @@ using System; using System.IO; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Microsoft.Spark.UnitTest.TestUtils; @@ -30,7 +31,6 @@ public void Test_CountVectorizer() const string inputColumn = "input"; const string outputColumn = "output"; const double minDf = 1; - const double maxDf = 100; const double minTf = 10; const int vocabSize = 10000; const bool binary = false; @@ -41,7 +41,6 @@ public void Test_CountVectorizer() .SetInputCol(inputColumn) .SetOutputCol(outputColumn) .SetMinDF(minDf) - .SetMaxDF(maxDf) .SetMinTF(minTf) .SetVocabSize(vocabSize); @@ -49,7 +48,6 @@ public void Test_CountVectorizer() Assert.Equal(inputColumn, 
countVectorizer.GetInputCol());
             Assert.Equal(outputColumn, countVectorizer.GetOutputCol());
             Assert.Equal(minDf, countVectorizer.GetMinDF());
-            Assert.Equal(maxDf, countVectorizer.GetMaxDF());
             Assert.Equal(minTf, countVectorizer.GetMinTF());
             Assert.Equal(vocabSize, countVectorizer.GetVocabSize());
             Assert.Equal(binary, countVectorizer.GetBinary());
@@ -65,6 +63,14 @@ public void Test_CountVectorizer()
 
             Assert.NotEmpty(countVectorizer.ExplainParams());
             Assert.NotEmpty(countVectorizer.ToString());
-        }
+        }
+
+        [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)]
+        public void CountVectorizer_MaxDF()
+        {
+            const double maxDf = 100;
+            CountVectorizer countVectorizer = new CountVectorizer().SetMaxDF(maxDf);
+            Assert.Equal(maxDf, countVectorizer.GetMaxDF());
+        }
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs
index b3fa0ef8a..5689e19fd 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs
@@ -110,6 +110,7 @@ public CountVectorizer SetOutputCol(string value) =>
         /// fraction of documents the term could appear in.
         /// </summary>
         /// <returns>The maximum document term frequency</returns>
+        [Since(Versions.V2_4_0)]
         public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF");
 
@@ -121,6 +122,7 @@ public CountVectorizer SetOutputCol(string value) =>
         /// </summary>
         /// <param name="value">The maximum document term frequency</param>
         /// <returns>New <see cref="CountVectorizer"/> with the max df value set</returns>
+        [Since(Versions.V2_4_0)]
         public CountVectorizer SetMaxDF(double value) =>
             WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMaxDF", value));
 
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
index fcc90b43d..0895dace1 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
@@ -98,7 +98,7 @@ public Param.Param GetParam(string paramName) =>
         public T Set(Param.Param param, object value) =>
             WrapAsType((JvmObjectReference)_jvmObject.Invoke("set", param, value));
 
-        private static T WrapAsType(JvmObjectReference reference)
+        internal static T WrapAsType(JvmObjectReference reference)
         {
             ConstructorInfo constructor = typeof(T)
                 .GetConstructors(BindingFlags.NonPublic | BindingFlags.Instance)
@@ -111,5 +111,6 @@ private static T WrapAsType(JvmObjectReference reference)
 
             return (T)constructor.Invoke(new object[] {reference});
         }
+
     }
 }
diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
index 2cddc5627..f284de8c6 100644
--- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
+++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
@@ -38,10 +38,7 @@
-
+

From 60eb82f40ac37c553ca00a3ab4d0e404e4447dca Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Fri, 31 Jul 2020 19:52:23 +0100
Subject: [PATCH 07/35] undoing csproj change

---
 .ionide/symbolCache.db                     | Bin 28672 -> 0 bytes
 .../Microsoft.Spark/Microsoft.Spark.csproj |   5 ++++-
 2 files changed, 4 insertions(+), 1 deletion(-)
 delete mode 100644 .ionide/symbolCache.db

diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db
deleted file mode 100644
index 43e567d6d682d85dd32b3baebb0fdf61f67c1643..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 28672
[base85-encoded binary data omitted]
diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
index f284de8c6..2cddc5627 100644
--- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
+++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
@@ -38,7 +38,10 @@
-
+

From ed36375561e3495a675f9ac14ab80f79f3fbb38d Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Fri, 31 Jul 2020 19:55:49 +0100
Subject: [PATCH 08/35] member doesnt need to be internal

---
 src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
index 0895dace1..8446b9f4e 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
@@ -98,7 +98,7 @@ public Param.Param GetParam(string paramName) =>
         public T Set(Param.Param param, object value) =>
             WrapAsType((JvmObjectReference)_jvmObject.Invoke("set", param, value));
 
-        internal static T WrapAsType(JvmObjectReference reference)
+        private static T WrapAsType(JvmObjectReference reference)

From c7baf7231914b10300175e67158b604d646b97d4 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Fri, 31 Jul 2020 19:56:29 +0100
Subject: [PATCH 09/35] too many lines

---
 src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
index 8446b9f4e..9ccd64d5b 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
@@ -106,11 +106,10 @@ private static T WrapAsType(JvmObjectReference reference)
             {
                 ParameterInfo[] parameters = c.GetParameters();
                 return (parameters.Length == 1) &&
-                    (parameters[0].ParameterType == typeof(JvmObjectReference));
+                    (parameters[0].ParameterType == typeof(JvmObjectReference));
             });
 
             return (T)constructor.Invoke(new object[] {reference});
         }
-
     }
 }

From d13303ccaeb691691c4d294d96e0995f3597becb Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Fri, 31 Jul 2020 20:01:07 +0100
Subject: [PATCH 10/35] removing whitespace change

---
 src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
index 9ccd64d5b..326268a5e 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
@@ -105,7 +105,7 @@ private static T WrapAsType(JvmObjectReference reference)
             .Single(c =>
             {
                 ParameterInfo[] parameters = c.GetParameters();
-                return (parameters.Length == 1) &&
+                return (parameters.Length == 1) &&
                     (parameters[0].ParameterType == typeof(JvmObjectReference));
             });
 

From f5b477c72158599b1c6552c7eb1af20edfab7779 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Fri, 31 Jul 2020 20:01:57 +0100
Subject: [PATCH 11/35] removing whitespace change

---
 src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
index 326268a5e..9ccd64d5b 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
@@ -105,7 +105,7 @@ private static T WrapAsType(JvmObjectReference reference)
             .Single(c =>
             {
                 ParameterInfo[] parameters = c.GetParameters();
-                return (parameters.Length == 1) &&
+                return (parameters.Length == 1) &&
                     (parameters[0].ParameterType == typeof(JvmObjectReference));
             });

From 73db52b400637585b2216f44aac616828800b9d2 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Fri, 31 Jul 2020 20:06:12 +0100
Subject: [PATCH 12/35] ionide

---
 .ionide/symbolCache.db                        | Bin 0 -> 28672 bytes
 .../Microsoft.Spark/ML/Feature/FeatureBase.cs |   2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 .ionide/symbolCache.db

diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db
new file mode 100644
index 0000000000000000000000000000000000000000..43e567d6d682d85dd32b3baebb0fdf61f67c1643
GIT binary patch
literal 28672
(GIT binary patch data omitted)
z*gvut>CcRpApt`Ih6D@=7!vqDl0cTI`}N6#m<}|fmUX}y7k4Wa4`L?jmVGkJ6*QPm zYp1g@6xEwkmnZ@8xJSUAea_QpR|;ZeGEFy$x=)|HN*vD)pBhe0C^NlncK8B;yY%7BE>k_JC|$KEu;nbLo^u3l z$A=f=Oa&b*Noz^uG)L4O_hgO6Lz)T9gtSOo1n!-O7mYD7>XTNG)BbMWD7*yKTnn{p2kcErn^;5QO|^~@dfjb=g`(vO!rQ!^N%?io_SZ? zsF<0ra;!kxqpY{^imti@RPfs8^z`%@jfgc_8rKLgN6`#2wD@X-UA_4A%qCGc(UU6~ zuc#A&3DvB(O|fV<3^&xn+pKdkFx9uS8&=()>Y`TKtVBwi1a7T|SN5q>Q|Fbs-R8Wi r5i_V}JR8*1*wjg2b^;fqlb8T7-kvp6;i)FU08O@N<|Vu8nsWLNN`{i9 literal 0 HcmV?d00001 diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index 9ccd64d5b..326268a5e 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -105,7 +105,7 @@ private static T WrapAsType(JvmObjectReference reference) .Single(c => { ParameterInfo[] parameters = c.GetParameters(); - return (parameters.Length == 1) && + return (parameters.Length == 1) && (parameters[0].ParameterType == typeof(JvmObjectReference)); }); From 8e1685cd270657c5e7a6769e732bf85d5ae6cb2e Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Thu, 13 Aug 2020 12:59:34 -0700 Subject: [PATCH 13/35] Revert "Merge branch 'master' into ml/countvectorizer" This reverts commit a766146f56014ccae4118b35495b84da588af94f, reversing changes made to 73db52b400637585b2216f44aac616828800b9d2. Reverting countvectorizer changes --- .gitignore | 3 --- .ionide/symbolCache.db | Bin 0 -> 28672 bytes .../Processor/BroadcastVariableProcessor.cs | 3 +-- 3 files changed, 1 insertion(+), 5 deletions(-) create mode 100644 .ionide/symbolCache.db diff --git a/.gitignore b/.gitignore index faada9c8a..251cfa7e2 100644 --- a/.gitignore +++ b/.gitignore @@ -367,6 +367,3 @@ hs_err_pid* # The target folder contains the output of building **/target/** - -# F# vs code -.ionide/ diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db new file mode 100644 index 0000000000000000000000000000000000000000..43e567d6d682d85dd32b3baebb0fdf61f67c1643 GIT binary patch literal 28672 zcmeHPYiuJ|6}A(f3n*{Ao>?UI&RXX3biC7$u_ zru*Uw_|aboAq4!aPz(G7Dp3^)2^9$msX|362qA<70{*l}EvWneB<{TIImtK)+7qdu zu{?L~_%Yvi&pr3fz2}^JGgp@L0vB1UR7NW^3^wa~*(5A|iH8H;*B z&*Jr7uND(C74gzvnY~{#(YNt3Bw$FukbofpLjr~b3<($#FeG3|z>t6;0j&hSxNf$G zdV0*S4h!s^BA3}J-Ki9L<b^F{6=TjCuvK9>U*;l97n^=RUn$l~UNgYJCMRsfNA1?9Bl`LCS%K#Ngzf z;{QO@KD+;){!jcL`9JXd#`A*vLwDKru4~cx6X&wy2aZ|$+xDBbpW9ZfKeZmWylYu) z{ax!q%P(3^n}21#VfwY{Wv%+^=L?ZG(qpDCr$c=8^*P6`^IVl5<5tIVd0~tSzgigM z?z5uk`LPT6Y_-By)&wRae!(ne*4gR?lUBdKT&?7)Y>8Rp+k4w%>fhA!qkaWU!g8j& z9avyP?TH38h17hd$}v3E&T>vpABO?_g&-RIX#2Phe6h%7MQ!I9p4+7FTpy5icQ=}> z549iWIkuWzms8^H1xPCcSS58?UH(Q%WgSo}pSi&1%ghFqw{V?jb6g|G_h<$0h{v%C z-gd4n!}2^=x>Jxh}w>8SGqHC9v|D(^V*GlIoc1w|iaru~I2##}Q_|T1nBeK3-|27DujYN}U*Qnog;yWG zZr^}f59*n?f&gs=trhybzAhapG&7CzWFvl0k6CDOn7FsU92`wI{g3@Pu)FM&(n0byiegJk8tp$;dZ(^ zv=Y$fNsAXqBh!KP@Nsv!7c*PDz?GP*+?q1NVCFOrK}H;XiU+ZH0EsJTZI6;T-JAq- zjuS+DU_#0}R28`M+eFlgwc9Ox&3NKZQZlZ_NMqF!>tue`EenJqeO= z-(k%EWxt9n4P*W<`j}){81sMGVPVYwoyPqCY2B>G{GWCz|I_n-YoKm$sAt63{IKZ@ zCjVW()Axq2+xxcnw&!C{$^DLd+4ZLDu=DedKRby1-}XDUcWqa#zqMYm{LFH`_1mrC zmLIi@o8L1(L;w7AFA|##XrEvtfaXaX7#xVzi>ihYNYj*Mww$X`*YV|QzC^=M?s7b{ zR2E&Ad_Jr7vJDN1UN$i$u~P>{b1*gdEEdg`lZyRFw(CoE4YZL{_BddI=VPNxf)tFtiS1c{kwLRP_Bs2KJFK{M=Zfgp z5MzaS^-RLY01>>A4J9(PJCPk;3|-Gg3h=}8Y*2oI=KR!=4YAJvd^~4-*c;Zwp=d)e zwB3Zp8E>CHASQ8d{J&ySn^K6#J;CrWR!`-^; zqM`|6+mM`(61?aElrp4!0${VlSjO{EDzvau3op=cAg;PpUaK$*T(-!H0bn9Ea6!6~ zfK+Z2jZ}ANN{^JVURgcM@|@U<%-5<_8pe2m6F=O3P0ZtfS{ltsMe8cM8#S4aNHRO7 
literal 0
HcmV?d00001

diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
index bf8f48ed8..41c817d02 100644
--- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
+++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
@@ -54,8 +54,7 @@ internal BroadcastVariables Process(Stream stream)
             else
             {
                 string path = SerDe.ReadString(stream);
-                using FileStream fStream =
-                    File.Open(path, FileMode.Open, FileAccess.Read, FileShare.Read);
+                using FileStream fStream = File.Open(path, FileMode.Open, FileAccess.Read);
                 object value = formatter.Deserialize(fStream);
                 BroadcastRegistry.Add(bid, value);
             }

From 255515eecbd6cb8e7919fbd2b857d99e335c66d2 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Thu, 13 Aug 2020 13:04:05 -0700
Subject: [PATCH 14/35] Revert "Merge branch 'ml/countvectorizer' of
 https://github.com/GoEddie/spark"

This reverts commit ad6bcede69de012c22178825e76c6b175c770b8f, reversing
changes made to 4c5d502a9f56e79ea071b12d2a49dced3873dea8.
reverting countvectorizer changes -2 --- .../ML/Feature/CountVectorizerModelTests.cs | 73 ------- .../ML/Feature/CountVectorizerTests.cs | 76 ------- .../ML/Feature/CountVectorizer.cs | 197 ------------------ .../ML/Feature/CountVectorizerModel.cs | 170 --------------- .../Microsoft.Spark/ML/Feature/FeatureBase.cs | 4 +- 5 files changed, 2 insertions(+), 518 deletions(-) delete mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs delete mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs delete mode 100644 src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs delete mode 100644 src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs deleted file mode 100644 index 3c3132dd9..000000000 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.IO; -using Microsoft.Spark.ML.Feature; -using Microsoft.Spark.Sql; -using Microsoft.Spark.UnitTest.TestUtils; -using Xunit; - -namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature -{ - [Collection("Spark E2E Tests")] - public class CountVectorizerModelTests - { - private readonly SparkSession _spark; - - public CountVectorizerModelTests(SparkFixture fixture) - { - _spark = fixture.Spark; - } - - [Fact] - public void Test_CountVectorizerModel() - { - DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " + - "'TOKENIZE') as input from range(100)"); - - const string inputColumn = "input"; - const string outputColumn = "output"; - const double minTf = 10.0; - const bool binary = false; - - List vocabulary = new List() - { - "hello", - "I", - "AM", - "TO", - "TOKENIZE" - }; - - var countVectorizerModel = new CountVectorizerModel(vocabulary); - - Assert.IsType(new CountVectorizerModel("my-uid", vocabulary)); - - countVectorizerModel = countVectorizerModel - .SetInputCol(inputColumn) - .SetOutputCol(outputColumn) - .SetMinTF(minTf) - .SetBinary(binary); - - Assert.Equal(inputColumn, countVectorizerModel.GetInputCol()); - Assert.Equal(outputColumn, countVectorizerModel.GetOutputCol()); - Assert.Equal(minTf, countVectorizerModel.GetMinTF()); - Assert.Equal(binary, countVectorizerModel.GetBinary()); - using (var tempDirectory = new TemporaryDirectory()) - { - string savePath = Path.Join(tempDirectory.Path, "countVectorizerModel"); - countVectorizerModel.Save(savePath); - - CountVectorizerModel loadedModel = CountVectorizerModel.Load(savePath); - Assert.Equal(countVectorizerModel.Uid(), loadedModel.Uid()); - } - - Assert.IsType(countVectorizerModel.GetVocabSize()); - Assert.NotEmpty(countVectorizerModel.ExplainParams()); - Assert.NotEmpty(countVectorizerModel.ToString()); - } - } -} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs deleted file mode 100644 index 95b9bc504..000000000 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to 
the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.IO; -using Microsoft.Spark.E2ETest.Utils; -using Microsoft.Spark.ML.Feature; -using Microsoft.Spark.Sql; -using Microsoft.Spark.UnitTest.TestUtils; -using Xunit; - -namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature -{ - [Collection("Spark E2E Tests")] - public class CountVectorizerTests - { - private readonly SparkSession _spark; - - public CountVectorizerTests(SparkFixture fixture) - { - _spark = fixture.Spark; - } - - [Fact] - public void Test_CountVectorizer() - { - DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " + - "'TOKENIZE') as input from range(100)"); - - const string inputColumn = "input"; - const string outputColumn = "output"; - const double minDf = 1; - const double minTf = 10; - const int vocabSize = 10000; - const bool binary = false; - - var countVectorizer = new CountVectorizer(); - - countVectorizer - .SetInputCol(inputColumn) - .SetOutputCol(outputColumn) - .SetMinDF(minDf) - .SetMinTF(minTf) - .SetVocabSize(vocabSize); - - Assert.IsType(countVectorizer.Fit(input)); - Assert.Equal(inputColumn, countVectorizer.GetInputCol()); - Assert.Equal(outputColumn, countVectorizer.GetOutputCol()); - Assert.Equal(minDf, countVectorizer.GetMinDF()); - Assert.Equal(minTf, countVectorizer.GetMinTF()); - Assert.Equal(vocabSize, countVectorizer.GetVocabSize()); - Assert.Equal(binary, countVectorizer.GetBinary()); - - using (var tempDirectory = new TemporaryDirectory()) - { - string savePath = Path.Join(tempDirectory.Path, "countVectorizer"); - countVectorizer.Save(savePath); - - CountVectorizer loadedVectorizer = CountVectorizer.Load(savePath); - Assert.Equal(countVectorizer.Uid(), loadedVectorizer.Uid()); - } - - Assert.NotEmpty(countVectorizer.ExplainParams()); - Assert.NotEmpty(countVectorizer.ToString()); - } - - [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] - public void CountVectorizer_MaxDF() - { - const double maxDf = 100; - CountVectorizer countVectorizer = new CountVectorizer().SetMaxDF(maxDf); - Assert.Equal(maxDf, countVectorizer.GetMaxDF()); - } - } -} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs deleted file mode 100644 index 5689e19fd..000000000 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs +++ /dev/null @@ -1,197 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.Spark.Interop; -using Microsoft.Spark.Interop.Ipc; -using Microsoft.Spark.Sql; - -namespace Microsoft.Spark.ML.Feature -{ - public class CountVectorizer : FeatureBase, IJvmObjectReferenceProvider - { - private static readonly string s_countVectorizerClassName = - "org.apache.spark.ml.feature.CountVectorizer"; - - /// - /// Create a without any parameters - /// - public CountVectorizer() : base(s_countVectorizerClassName) - { - } - - /// - /// Create a with a UID that is used to give the - /// a unique ID - /// - /// An immutable unique ID for the object and its derivatives. 
- public CountVectorizer(string uid) : base(s_countVectorizerClassName, uid) - { - } - - internal CountVectorizer(JvmObjectReference jvmObject) : base(jvmObject) - { - } - - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - - /// Fits a model to the input data. - /// The to fit the model to. - /// - public CountVectorizerModel Fit(DataFrame dataFrame) => - new CountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("fit", dataFrame)); - - /// - /// Loads the that was previously saved using Save - /// - /// - /// The path the previous was saved to - /// - /// New object - public static CountVectorizer Load(string path) => - WrapAsCountVectorizer((JvmObjectReference) - SparkEnvironment.JvmBridge.CallStaticJavaMethod( - s_countVectorizerClassName,"load", path)); - - /// - /// Gets the binary toggle to control the output vector values. If True, all nonzero counts - /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic - /// models that model binary events rather than integer counts. Default: false - /// - /// boolean - public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); - - /// - /// Sets the binary toggle to control the output vector values. If True, all nonzero counts - /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic - /// models that model binary events rather than integer counts. Default: false - /// - /// Turn the binary toggle on or off - /// with the new binary toggle value set - public CountVectorizer SetBinary(bool value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); - - /// - /// Gets the column that the should read from and convert - /// into buckets. This would have been set by SetInputCol - /// - /// string, the input column - public string GetInputCol() => _jvmObject.Invoke("getInputCol") as string; - - /// - /// Sets the column that the should read from. - /// - /// The name of the column to as the source. - /// with the input column set - public CountVectorizer SetInputCol(string value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setInputCol", value)); - - /// - /// The will create a new column in the DataFrame, this is - /// the name of the new column. - /// - /// The name of the output column. - public string GetOutputCol() => _jvmObject.Invoke("getOutputCol") as string; - - /// - /// The will create a new column in the DataFrame, this - /// is the name of the new column. - /// - /// The name of the output column which will be created. - /// New with the output column set - public CountVectorizer SetOutputCol(string value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setOutputCol", value)); - - /// - /// Gets the maximum number of different documents a term could appear in to be included in - /// the vocabulary. A term that appears more than the threshold will be ignored. If this is - /// an integer greater than or equal to 1, this specifies the maximum number of documents - /// the term could appear in; if this is a double in [0,1), then this specifies the maximum - /// fraction of documents the term could appear in. - /// - /// The maximum document term frequency - [Since(Versions.V2_4_0)] - public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); - - /// - /// Sets the maximum number of different documents a term could appear in to be included in - /// the vocabulary. A term that appears more than the threshold will be ignored. 
If this is - /// an integer greater than or equal to 1, this specifies the maximum number of documents - /// the term could appear in; if this is a double in [0,1), then this specifies the maximum - /// fraction of documents the term could appear in. - /// - /// The maximum document term frequency - /// New with the max df value set - [Since(Versions.V2_4_0)] - public CountVectorizer SetMaxDF(double value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMaxDF", value)); - - /// - /// Gets the minimum number of different documents a term must appear in to be included in - /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the - /// number of documents the term must appear in; if this is a double in [0,1), then this - /// specifies the fraction of documents. - /// - /// The minimum document term frequency - public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF"); - - /// - /// Sets the minimum number of different documents a term must appear in to be included in - /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the - /// number of documents the term must appear in; if this is a double in [0,1), then this - /// specifies the fraction of documents. - /// - /// The minimum document term frequency - /// New with the min df value set - public CountVectorizer SetMinDF(double value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMinDF", value)); - - /// - /// Filter to ignore rare words in a document. For each document, terms with - /// frequency/count less than the given threshold are ignored. If this is an integer - /// greater than or equal to 1, then this specifies a count (of times the term must appear - /// in the document); if this is a double in [0,1), then this specifies a fraction (out of - /// the document's token count). - /// - /// Note that the parameter is only used in transform of CountVectorizerModel and does not - /// affect fitting. - /// - /// Minimum term frequency - public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF"); - - /// - /// Filter to ignore rare words in a document. For each document, terms with - /// frequency/count less than the given threshold are ignored. If this is an integer - /// greater than or equal to 1, then this specifies a count (of times the term must appear - /// in the document); if this is a double in [0,1), then this specifies a fraction (out of - /// the document's token count). - /// - /// Note that the parameter is only used in transform of CountVectorizerModel and does not - /// affect fitting. - /// - /// Minimum term frequency - /// New with the min term frequency set - public CountVectorizer SetMinTF(double value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMinTF", value)); - - /// - /// Gets the max size of the vocabulary. CountVectorizer will build a vocabulary that only - /// considers the top vocabSize terms ordered by term frequency across the corpus. - /// - /// The max size of the vocabulary - public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); - - /// - /// Sets the max size of the vocabulary. will build a - /// vocabulary that only considers the top vocabSize terms ordered by term frequency across - /// the corpus. 
- /// - /// The max vocabulary size - /// with the max vocab value set - public CountVectorizer SetVocabSize(int value) => - WrapAsCountVectorizer(_jvmObject.Invoke("setVocabSize", value)); - - private static CountVectorizer WrapAsCountVectorizer(object obj) => - new CountVectorizer((JvmObjectReference)obj); - } -} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs deleted file mode 100644 index 52bbd72c3..000000000 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs +++ /dev/null @@ -1,170 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using Microsoft.Spark.Interop; -using Microsoft.Spark.Interop.Ipc; - -namespace Microsoft.Spark.ML.Feature -{ - public class CountVectorizerModel : FeatureBase - , IJvmObjectReferenceProvider - { - private static readonly string s_countVectorizerModelClassName = - "org.apache.spark.ml.feature.CountVectorizerModel"; - - /// - /// Create a without any parameters - /// - /// The vocabulary to use - public CountVectorizerModel(List vocabulary) : - this(SparkEnvironment.JvmBridge.CallConstructor( - s_countVectorizerModelClassName, vocabulary)) - { - } - - /// - /// Create a with a UID that is used to give the - /// a unique ID - /// - /// An immutable unique ID for the object and its derivatives. - /// The vocabulary to use - public CountVectorizerModel(string uid, List vocabulary) : - this(SparkEnvironment.JvmBridge.CallConstructor( - s_countVectorizerModelClassName, uid, vocabulary)) - { - } - - internal CountVectorizerModel(JvmObjectReference jvmObject) : base(jvmObject) - { - } - - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - - /// - /// Loads the that was previously saved using Save - /// - /// - /// The path the previous was saved to - /// - /// New object - public static CountVectorizerModel Load(string path) => - WrapAsCountVectorizerModel((JvmObjectReference) - SparkEnvironment.JvmBridge.CallStaticJavaMethod( - s_countVectorizerModelClassName,"load", path)); - - /// - /// Gets the binary toggle to control the output vector values. If True, all nonzero counts - /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic - /// models that model binary events rather than integer counts. Default: false - /// - /// boolean - public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); - - /// - /// Sets the binary toggle to control the output vector values. If True, all nonzero counts - /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic - /// models that model binary events rather than integer counts. Default: false - /// - /// Turn the binary toggle on or off - /// - /// with the new binary toggle value set - /// - public CountVectorizerModel SetBinary(bool value) => - WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); - - /// - /// Gets the column that the should read from and - /// convert into buckets. This would have been set by SetInputCol - /// - /// string, the input column - public string GetInputCol() => _jvmObject.Invoke("getInputCol") as string; - - /// - /// Sets the column that the should read from. - /// - /// The name of the column to as the source. 
- /// with the input column set - public CountVectorizerModel SetInputCol(string value) => - WrapAsCountVectorizerModel( - (JvmObjectReference)_jvmObject.Invoke("setInputCol", value)); - - /// - /// The will create a new column in the DataFrame, this - /// is the name of the new column. - /// - /// The name of the output column. - public string GetOutputCol() => _jvmObject.Invoke("getOutputCol") as string; - - /// - /// The will create a new column in the DataFrame, - /// this is the name of the new column. - /// - /// The name of the output column which will be created. - /// New with the output column set - public CountVectorizerModel SetOutputCol(string value) => - WrapAsCountVectorizerModel( - (JvmObjectReference)_jvmObject.Invoke("setOutputCol", value)); - - /// - /// Gets the maximum number of different documents a term could appear in to be included in - /// the vocabulary. A term that appears more than the threshold will be ignored. If this is - /// an integer greater than or equal to 1, this specifies the maximum number of documents - /// the term could appear in; if this is a double in [0,1), then this specifies the maximum - /// fraction of documents the term could appear in. - /// - /// The maximum document term frequency - public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); - - /// - /// Gets the minimum number of different documents a term must appear in to be included in - /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the - /// number of documents the term must appear in; if this is a double in [0,1), then this - /// specifies the fraction of documents. - /// - /// The minimum document term frequency - public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF"); - - /// - /// Filter to ignore rare words in a document. For each document, terms with - /// frequency/count less than the given threshold are ignored. If this is an integer - /// greater than or equal to 1, then this specifies a count (of times the term must appear - /// in the document); if this is a double in [0,1), then this specifies a fraction (out of - /// the document's token count). - /// - /// Note that the parameter is only used in transform of CountVectorizerModel and does not - /// affect fitting. - /// - /// Minimum term frequency - public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF"); - - /// - /// Filter to ignore rare words in a document. For each document, terms with - /// frequency/count less than the given threshold are ignored. If this is an integer - /// greater than or equal to 1, then this specifies a count (of times the term must appear - /// in the document); if this is a double in [0,1), then this specifies a fraction (out of - /// the document's token count). - /// - /// Note that the parameter is only used in transform of CountVectorizerModel and does not - /// affect fitting. - /// - /// Minimum term frequency - /// - /// New with the min term frequency set - /// - public CountVectorizerModel SetMinTF(double value) => - WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setMinTF", value)); - - /// - /// Gets the max size of the vocabulary. will build a - /// vocabulary that only considers the top vocabSize terms ordered by term frequency across - /// the corpus. 
-    ///
-    /// The max size of the vocabulary
-    public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize");
-
-    private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) =>
-        new CountVectorizerModel((JvmObjectReference)obj);
-    }
-}

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
index 326268a5e..fcc90b43d 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs
@@ -105,8 +105,8 @@ private static T WrapAsType(JvmObjectReference reference)
             .Single(c =>
             {
                 ParameterInfo[] parameters = c.GetParameters();
-                return (parameters.Length == 1) &&
-                    (parameters[0].ParameterType == typeof(JvmObjectReference));
+                return (parameters.Length == 1) &&
+                    (parameters[0].ParameterType == typeof(JvmObjectReference));
             });

         return (T)constructor.Invoke(new object[] {reference});

From 3c2c936b007d7b5d761fda737625dc8f7d03728b Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Fri, 14 Aug 2020 13:32:54 -0700
Subject: [PATCH 15/35] fixing merge errors

---
 .gitignore                                    | 3 +++
 .../Processor/BroadcastVariableProcessor.cs   | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 251cfa7e2..8e67b5699 100644
--- a/.gitignore
+++ b/.gitignore
@@ -367,3 +367,6 @@ hs_err_pid*

 # The target folder contains the output of building
 **/target/**
+
+# F# vs code
+.ionide/
\ No newline at end of file

diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
index 41c817d02..bf8f48ed8 100644
--- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
+++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
@@ -54,7 +54,8 @@ internal BroadcastVariables Process(Stream stream)
             else
             {
                 string path = SerDe.ReadString(stream);
-                using FileStream fStream = File.Open(path, FileMode.Open, FileAccess.Read);
+                using FileStream fStream =
+                    File.Open(path, FileMode.Open, FileAccess.Read, FileShare.Read);
                 object value = formatter.Deserialize(fStream);
                 BroadcastRegistry.Add(bid, value);
             }

From 88e834d53b7be8931147a095a7b0df3c08cd9aa8 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Wed, 19 Aug 2020 19:24:14 -0700
Subject: [PATCH 16/35] removing ionid

---
 .gitignore             |   2 +-
 .ionide/symbolCache.db | Bin 28672 -> 0 bytes
 2 files changed, 1 insertion(+), 1 deletion(-)
 delete mode 100644 .ionide/symbolCache.db

diff --git a/.gitignore b/.gitignore
index 8e67b5699..faada9c8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -369,4 +369,4 @@ hs_err_pid*
 **/target/**

 # F# vs code
-.ionide/
\ No newline at end of file
+.ionide/

diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db
deleted file mode 100644
index 43e567d6d682d85dd32b3baebb0fdf61f67c1643..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 28672
(GIT binary patch data omitted)
+ /// + [Since(Versions.V3_0_0)] + public sealed class DataFrameWriterV2 : IJvmObjectReferenceProvider + { + private readonly JvmObjectReference _jvmObject; + + internal DataFrameWriterV2(JvmObjectReference jvmObject) => _jvmObject = jvmObject; + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Adds an output option for the underlying data source. + /// + /// Name of the option + /// Value of the option + /// This DataFrameWriterV2 object + public DataFrameWriterV2 Option(string key, string value) + { + _jvmObject.Invoke("option", key, value); + return this; + } + + /// + /// Adds output options for the underlying data source. + /// + /// Key/value options + /// This DataFrameWriterV2 object + public DataFrameWriterV2 Options(Dictionary options) + { + _jvmObject.Invoke("options", options); + return this; + } + + + } +} From abea46aba10068187dbae52a02586a917ec34b9f Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 03:20:49 -0700 Subject: [PATCH 18/35] exposing public APIs --- .../Microsoft.Spark/Sql/DataFrameWriterV2.cs | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs b/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs index 9683c6bdc..99dc0a074 100644 --- a/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs +++ b/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs @@ -44,6 +44,42 @@ public DataFrameWriterV2 Options(Dictionary options) return this; } - + public void Create() + { + _jvmObject.Invoke("create"); + } + + public void Replace() + { + _jvmObject.Invoke("replace"); + } + + public void CreateOrReplace() + { + _jvmObject.Invoke("createOrReplace"); + } + + /// + /// Append the contents of the data frame to the output table. + /// + public void Append() + { + _jvmObject.Invoke("append"); + } + + /// + /// Overwrite rows matching the given filter condition with the contents of the data frame + /// in the output table. + /// + /// + public void Overwrite(Column condition) + { + _jvmObject.Invoke("overwrite", condition); + } + + public void OverwritePartitions() + { + _jvmObject.Invoke("overwritePartitions"); + } } } From 07fbfaa94c1fdbecb77f21b087dc4446deb04cd2 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 16:20:14 -0700 Subject: [PATCH 19/35] changes --- src/csharp/Microsoft.Spark/Sql/DataFrame.cs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/csharp/Microsoft.Spark/Sql/DataFrame.cs b/src/csharp/Microsoft.Spark/Sql/DataFrame.cs index 587b3d351..44fe3c7a1 100644 --- a/src/csharp/Microsoft.Spark/Sql/DataFrame.cs +++ b/src/csharp/Microsoft.Spark/Sql/DataFrame.cs @@ -535,6 +535,14 @@ public DataFrame Agg(Column expr, params Column[] exprs) => public DataFrame Observe(string name, Column expr, params Column[] exprs) => WrapAsDataFrame(_jvmObject.Invoke("observe", name, expr, exprs)); + /// + /// Create a write configuration builder for v2 sources. + /// + /// Name of table to write to + /// DataFrameWriterV2 object + public DataFrameWriterV2 WriteTo(string table) => + WrapAsDataFrameWriterV2(_jvmObject.Invoke("writeTo", table)); + /// /// Returns a new `DataFrame` by taking the first `number` rows. 
/// @@ -1049,6 +1057,9 @@ private IEnumerable GetRows(string funcName, params object[] args) private DataFrame WrapAsDataFrame(object obj) => new DataFrame((JvmObjectReference)obj); + private DataFrameWriterV2 WrapAsDataFrameWriterV2(object obj) => + new DataFrameWriterV2((JvmObjectReference)obj); + private Column WrapAsColumn(object obj) => new Column((JvmObjectReference)obj); private RelationalGroupedDataset WrapAsGroupedDataset(object obj) => From f54134861e1bfba1c20862dd09abfaedff784d70 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 18:45:46 -0700 Subject: [PATCH 20/35] Adding DataFrameWriterV2 test file --- .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 41 +++++++ .../Microsoft.Spark/Sql/DataFrameWriterV2.cs | 103 +++++++++++++++++- 2 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs new file mode 100644 index 000000000..7d365c2bd --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs @@ -0,0 +1,41 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using Microsoft.Spark.E2ETest.Utils; +using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests +{ + [Collection("Spark E2E Tests")] + public class DataFrameWriterV2Tests + { + private readonly SparkSession _spark; + + public DataFrameWriterV2Tests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + /// + /// Test signatures for APIs introduced in Spark 3.*. + /// + [SkipIfSparkVersionIsLessThan(Versions.V3_0_0)] + public void TestSignaturesV3_0_X() + { + { + DataFrameWriterV2 dfwV2 = _spark + .Read() + .Schema("age INT, name STRING") + .Json($"{TestEnvironment.ResourceDirectory}people.json") + .WriteTo("testTable"); + + Assert.IsAssignableFrom(dfwV2.Using("json")); + + Assert.IsType(dfwV2.Option("key", "value")); + } + } +} diff --git a/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs b/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs index 99dc0a074..123631513 100644 --- a/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs +++ b/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. using System.Collections.Generic; -using Microsoft.Spark.Interop.Internal.Java.Util; using Microsoft.Spark.Interop.Ipc; namespace Microsoft.Spark.Sql @@ -13,7 +12,7 @@ namespace Microsoft.Spark.Sql /// API. /// [Since(Versions.V3_0_0)] - public sealed class DataFrameWriterV2 : IJvmObjectReferenceProvider + public sealed class DataFrameWriterV2 : IJvmObjectReferenceProvider, CreateTableWriter { private readonly JvmObjectReference _jvmObject; @@ -21,6 +20,17 @@ public sealed class DataFrameWriterV2 : IJvmObjectReferenceProvider JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Specifies a provider for the underlying output data source. Spark's default catalog + /// supports "parquet", "json", etc. 
+ /// + /// Provider name + /// CreateTableWriter instance + public CreateTableWriter Using(string provider) + { + return (CreateTableWriter)_jvmObject.Invoke("using", provider); + } + /// /// Adds an output option for the underlying data source. /// @@ -44,16 +54,48 @@ public DataFrameWriterV2 Options(Dictionary options) return this; } + /// + /// Add a table property. + /// + /// Name of property + /// Value of the property + /// CreateTableWriter instance + public CreateTableWriter TableProperty(string property, string value) + { + return (CreateTableWriter)_jvmObject.Invoke("tableProperty", property, value); + } + + /// + /// Partition the output table created by `create`, `createOrReplace`, or `replace` using + /// the given columns or transforms. + /// + /// Column name to partition on + /// Columns to partition on + /// CreateTableWriter instance + public CreateTableWriter PartitionedBy(Column column, params Column[] columns) + { + return (CreateTableWriter)_jvmObject.Invoke("partitionedBy", column, columns); + } + + /// + /// Create a new table from the contents of the data frame. + /// public void Create() { _jvmObject.Invoke("create"); } + /// + /// Replace an existing table with the contents of the data frame. + /// public void Replace() { _jvmObject.Invoke("replace"); } + /// + /// Create a new table or replace an existing table with the contents of the data frame. + /// public void CreateOrReplace() { _jvmObject.Invoke("createOrReplace"); @@ -71,15 +113,70 @@ public void Append() /// Overwrite rows matching the given filter condition with the contents of the data frame /// in the output table. /// - /// + /// Condition filter to overwrite based on public void Overwrite(Column condition) { _jvmObject.Invoke("overwrite", condition); } + /// + /// Overwrite all partition for which the data frame contains at least one row with the + /// contents of the data frame in the output table. + /// public void OverwritePartitions() { _jvmObject.Invoke("overwritePartitions"); } } + + /// + /// Interface to restrict calls to create and replace operations. + /// + [Since(Versions.V3_0_0)] + public interface CreateTableWriter + { + /// + /// Create a new table from the contents of the data frame. + /// The new table's schema, partition layout, properties, and other configuration will be based + /// on the configuration set on this writer. + /// + public void Create(); + + /// + /// Replace an existing table with the contents of the data frame. + /// The existing table's schema, partition layout, properties, and other configuration will be + /// replaced with the contents of the data frame and the configuration set on this writer. + /// + public void Replace(); + + /// + /// Create a new table or replace an existing table with the contents of the data frame. + /// + public void CreateOrReplace(); + + /// + /// Partition the output table created by `create`, `createOrReplace`, or `replace` using + /// the given columns or transforms. + /// + /// Column name to partition on + /// Columns to partition on + /// CreateTableWriter instance + public CreateTableWriter PartitionedBy(Column column, params Column[] columns); + + /// + /// Specifies a provider for the underlying output data source. Spark's default catalog + /// supports "parquet", "json", etc. + /// + /// Provider string value + /// CreateTableWriter instance + public CreateTableWriter Using(string provider); + + /// + /// Add a table property. 
+ /// + /// Name of property + /// Value of the property + /// CreateTableWriter instance + public CreateTableWriter TableProperty(string property, string value); + } } From 90166357d2033fc60dd065763d338d555d688f12 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 19:51:41 -0700 Subject: [PATCH 21/35] changes --- .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs index 7d365c2bd..1535aeae5 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs @@ -5,7 +5,6 @@ using System.Collections.Generic; using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Sql; -using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests @@ -26,16 +25,35 @@ public DataFrameWriterV2Tests(SparkFixture fixture) [SkipIfSparkVersionIsLessThan(Versions.V3_0_0)] public void TestSignaturesV3_0_X() { - { - DataFrameWriterV2 dfwV2 = _spark - .Read() - .Schema("age INT, name STRING") - .Json($"{TestEnvironment.ResourceDirectory}people.json") - .WriteTo("testTable"); + DataFrame df = _spark + .Read() + .Schema("age INT, name STRING") + .Json($"{TestEnvironment.ResourceDirectory}people.json"); - Assert.IsAssignableFrom(dfwV2.Using("json")); + DataFrameWriterV2 dfwV2 = df.WriteTo("testTable"); - Assert.IsType(dfwV2.Option("key", "value")); + //Assert.IsAssignableFrom(dfwV2.Using("json")); + + Assert.IsType(dfwV2.Option("key", "value")); + + Assert.IsType(dfwV2.Options( + new Dictionary() { { "key", "value" } })); + + //Assert.IsAssignableFrom(dfwV2.TableProperty("prop", "value")); + + //Assert.IsAssignableFrom(dfwV2.PartitionedBy(df.Col("age"))); + + dfwV2.Create(); + + //dfwV2.Replace(); + + //dfwV2.CreateOrReplace(); + + //dfwV2.Append(); + + //dfwV2.Overwrite(df.Col("age")); + + //dfwV2.OverwritePartitions(); } } } From 436a519149e8b829ef2f8cfc81a0f15eb3b85512 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 22 Sep 2020 10:20:10 -0700 Subject: [PATCH 22/35] changes --- .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 29 +++++--- .../Microsoft.Spark/Sql/DataFrameWriterV2.cs | 74 ++++--------------- 2 files changed, 33 insertions(+), 70 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs index 1535aeae5..faf1124e6 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs @@ -3,8 +3,10 @@ // See the LICENSE file in the project root for more information. 
using System.Collections.Generic; +using System.IO; using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests @@ -30,30 +32,39 @@ public void TestSignaturesV3_0_X() .Schema("age INT, name STRING") .Json($"{TestEnvironment.ResourceDirectory}people.json"); - DataFrameWriterV2 dfwV2 = df.WriteTo("testTable"); + DataFrameWriterV2 dfwV2 = df.WriteTo("testtable"); - //Assert.IsAssignableFrom(dfwV2.Using("json")); + Assert.IsType(dfwV2.Using("json")); Assert.IsType(dfwV2.Option("key", "value")); Assert.IsType(dfwV2.Options( new Dictionary() { { "key", "value" } })); - //Assert.IsAssignableFrom(dfwV2.TableProperty("prop", "value")); - - //Assert.IsAssignableFrom(dfwV2.PartitionedBy(df.Col("age"))); + Assert.IsType(dfwV2.TableProperty("prop", "value")); dfwV2.Create(); - //dfwV2.Replace(); + Assert.IsType(dfwV2.PartitionedBy(df.Col("age"))); + + // Testing caveat 1************************************************************* + // Functions Replace() and CreateOrReplace() fail with the following error: + // REPLACE TABLE AS SELECT is only supported with v2 tables. + // This is because Spark 3.0 currently doesn't support file source as provider for + // tables. Issue - https://issues.apache.org/jira/browse/SPARK-28396 + //df2.WriteTo("testTable").Replace(); //dfwV2.CreateOrReplace(); - //dfwV2.Append(); + // ***************************************************************************** + + // Table needs TableCapability such as BATCH_WRITE in order to be able to append to it. + + //dfwV2.Append(); // Table default.testtable does not support append in batch mode. - //dfwV2.Overwrite(df.Col("age")); + //dfwV2.Overwrite(df.Col("age")); //Table default.testtable does not support overwrite by filter in batch mode. - //dfwV2.OverwritePartitions(); + //dfwV2.OverwritePartitions(); //Table default.testtable does not support dynamic overwrite in batch mode. } } } diff --git a/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs b/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs index 123631513..9406b176f 100644 --- a/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs +++ b/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs @@ -12,7 +12,7 @@ namespace Microsoft.Spark.Sql /// API. /// [Since(Versions.V3_0_0)] - public sealed class DataFrameWriterV2 : IJvmObjectReferenceProvider, CreateTableWriter + public sealed class DataFrameWriterV2 : IJvmObjectReferenceProvider { private readonly JvmObjectReference _jvmObject; @@ -25,10 +25,11 @@ public sealed class DataFrameWriterV2 : IJvmObjectReferenceProvider, CreateTable /// supports "parquet", "json", etc. 
/// /// Provider name - /// CreateTableWriter instance - public CreateTableWriter Using(string provider) + /// This DataFrameWriterV2 object + public DataFrameWriterV2 Using(string provider) { - return (CreateTableWriter)_jvmObject.Invoke("using", provider); + _jvmObject.Invoke("using", provider); + return this; } /// @@ -59,10 +60,11 @@ public DataFrameWriterV2 Options(Dictionary options) /// /// Name of property /// Value of the property - /// CreateTableWriter instance - public CreateTableWriter TableProperty(string property, string value) + /// This DataFrameWriterV2 object + public DataFrameWriterV2 TableProperty(string property, string value) { - return (CreateTableWriter)_jvmObject.Invoke("tableProperty", property, value); + _jvmObject.Invoke("tableProperty", property, value); + return this; } /// @@ -71,10 +73,11 @@ public CreateTableWriter TableProperty(string property, string value) /// /// Column name to partition on /// Columns to partition on - /// CreateTableWriter instance - public CreateTableWriter PartitionedBy(Column column, params Column[] columns) + /// This DataFrameWriterV2 object + public DataFrameWriterV2 PartitionedBy(Column column, params Column[] columns) { - return (CreateTableWriter)_jvmObject.Invoke("partitionedBy", column, columns); + _jvmObject.Invoke("partitionedBy", column, columns); + return this; } /// @@ -128,55 +131,4 @@ public void OverwritePartitions() _jvmObject.Invoke("overwritePartitions"); } } - - /// - /// Interface to restrict calls to create and replace operations. - /// - [Since(Versions.V3_0_0)] - public interface CreateTableWriter - { - /// - /// Create a new table from the contents of the data frame. - /// The new table's schema, partition layout, properties, and other configuration will be based - /// on the configuration set on this writer. - /// - public void Create(); - - /// - /// Replace an existing table with the contents of the data frame. - /// The existing table's schema, partition layout, properties, and other configuration will be - /// replaced with the contents of the data frame and the configuration set on this writer. - /// - public void Replace(); - - /// - /// Create a new table or replace an existing table with the contents of the data frame. - /// - public void CreateOrReplace(); - - /// - /// Partition the output table created by `create`, `createOrReplace`, or `replace` using - /// the given columns or transforms. - /// - /// Column name to partition on - /// Columns to partition on - /// CreateTableWriter instance - public CreateTableWriter PartitionedBy(Column column, params Column[] columns); - - /// - /// Specifies a provider for the underlying output data source. Spark's default catalog - /// supports "parquet", "json", etc. - /// - /// Provider string value - /// CreateTableWriter instance - public CreateTableWriter Using(string provider); - - /// - /// Add a table property. 
- /// - /// Name of property - /// Value of the property - /// CreateTableWriter instance - public CreateTableWriter TableProperty(string property, string value); - } } From 8f30d95776b0721f2e5f9d45121be9612fc098f6 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Thu, 24 Sep 2020 14:44:25 -0700 Subject: [PATCH 23/35] Commenting out tests --- .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs index faf1124e6..abcfe0803 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs @@ -47,24 +47,18 @@ public void TestSignaturesV3_0_X() Assert.IsType(dfwV2.PartitionedBy(df.Col("age"))); - // Testing caveat 1************************************************************* - // Functions Replace() and CreateOrReplace() fail with the following error: - // REPLACE TABLE AS SELECT is only supported with v2 tables. + + // Testing caveat ************************************************************* + // The following functions cannot be tested because of lack of support for DataSourceV2. // This is because Spark 3.0 currently doesn't support file source as provider for // tables. Issue - https://issues.apache.org/jira/browse/SPARK-28396 - - //df2.WriteTo("testTable").Replace(); - //dfwV2.CreateOrReplace(); - + // + // 1. DataFrameWriterV2.Replace() + // 2. DataFrameWriterV2.CreateOrReplace() + // 3. DataFrameWriterV2.Append() + // 4. DataFrameWriterV2.Overwrite() + // 5. DataFrameWriterV2.OverwritePartitions() // ***************************************************************************** - - // Table needs TableCapability such as BATCH_WRITE in order to be able to append to it. - - //dfwV2.Append(); // Table default.testtable does not support append in batch mode. - - //dfwV2.Overwrite(df.Col("age")); //Table default.testtable does not support overwrite by filter in batch mode. - - //dfwV2.OverwritePartitions(); //Table default.testtable does not support dynamic overwrite in batch mode. 
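+
+            // Illustrative sketch only: the default session catalog is backed by file
+            // sources, so exercising the calls listed above would require a catalog that
+            // implements the V2 TableCatalog API. The catalog name and class below are
+            // assumptions for illustration, not values configured in this repo.
+            //
+            //   _spark.Conf().Set("spark.sql.catalog.testcat", "<a V2 TableCatalog class>");
+            //   df.WriteTo("testcat.testtable").Create();
+            //   df.WriteTo("testcat.testtable").Append();
+            //   df.WriteTo("testcat.testtable").OverwritePartitions();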
From 684dd901c6e56937a2a630f8e0737e5486b508bd Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Thu, 24 Sep 2020 22:16:17 -0700
Subject: [PATCH 24/35] changes

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index abcfe0803..36b766a86 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -43,6 +43,7 @@ public void TestSignaturesV3_0_X()
             Assert.IsType<DataFrameWriterV2>(dfwV2.TableProperty("prop", "value"));
 
+            //_spark.Sql("DROP TABLE IF EXISTS default.testtable");
             dfwV2.Create();
 
             Assert.IsType<DataFrameWriterV2>(dfwV2.PartitionedBy(df.Col("age")));

From c76aec741cff709077b01a3f935358438429787c Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Thu, 24 Sep 2020 22:29:38 -0700
Subject: [PATCH 25/35] Dropping test table if exists

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index 36b766a86..156bf9af3 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -43,7 +43,7 @@ public void TestSignaturesV3_0_X()
             Assert.IsType<DataFrameWriterV2>(dfwV2.TableProperty("prop", "value"));
 
-            //_spark.Sql("DROP TABLE IF EXISTS default.testtable");
+            _spark.Sql("DROP TABLE IF EXISTS default.testtable");
             dfwV2.Create();
 
             Assert.IsType<DataFrameWriterV2>(dfwV2.PartitionedBy(df.Col("age")));

From 76b205a5b72c03a610c88a71d639b01e7e29c751 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Fri, 25 Sep 2020 13:45:59 -0700
Subject: [PATCH 26/35] PR comments

---
 src/csharp/Microsoft.Spark/Sql/DataFrame.cs   |  5 +-
 .../Microsoft.Spark/Sql/DataFrameWriterV2.cs  | 75 ++++++++++++-------
 2 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/src/csharp/Microsoft.Spark/Sql/DataFrame.cs b/src/csharp/Microsoft.Spark/Sql/DataFrame.cs
index 44fe3c7a1..806ce7ec7 100644
--- a/src/csharp/Microsoft.Spark/Sql/DataFrame.cs
+++ b/src/csharp/Microsoft.Spark/Sql/DataFrame.cs
@@ -541,7 +541,7 @@ public DataFrame Observe(string name, Column expr, params Column[] exprs) =>
         /// </summary>
         /// <param name="table">Name of table to write to</param>
         /// <returns>DataFrameWriterV2 object</returns>
         public DataFrameWriterV2 WriteTo(string table) =>
-            WrapAsDataFrameWriterV2(_jvmObject.Invoke("writeTo", table));
+            new DataFrameWriterV2((JvmObjectReference)_jvmObject.Invoke("writeTo", table));
 
         /// <summary>
         /// Returns a new `DataFrame` by taking the first `number` rows.
@@ -1057,9 +1057,6 @@ private IEnumerable<Row> GetRows(string funcName, params object[] args)
         private DataFrame WrapAsDataFrame(object obj) => new DataFrame((JvmObjectReference)obj);
 
-        private DataFrameWriterV2 WrapAsDataFrameWriterV2(object obj) =>
-            new DataFrameWriterV2((JvmObjectReference)obj);
-
         private Column WrapAsColumn(object obj) => new Column((JvmObjectReference)obj);
 
         private RelationalGroupedDataset WrapAsGroupedDataset(object obj) =>

diff --git a/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs b/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs
index 9406b176f..fcd4378aa 100644
--- a/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs
+++ b/src/csharp/Microsoft.Spark/Sql/DataFrameWriterV2.cs
@@ -8,7 +8,7 @@ namespace Microsoft.Spark.Sql
 {
     /// <summary>
-    /// Interface used to write a [[org.apache.spark.sql.Dataset]] to external storage using the v2
+    /// Interface used to write a <see cref="DataFrame"/> to external storage using the v2
     /// API.
     /// </summary>
     [Since(Versions.V3_0_0)]
@@ -36,7 +36,7 @@ public DataFrameWriterV2 Using(string provider)
         /// Adds an output option for the underlying data source.
         /// </summary>
         /// <param name="key">Name of the option</param>
-        /// <param name="value">Value of the option</param>
+        /// <param name="value">String value of the option</param>
         /// <returns>This DataFrameWriterV2 object</returns>
         public DataFrameWriterV2 Option(string key, string value)
         {
@@ -44,6 +44,42 @@ public DataFrameWriterV2 Option(string key, string value)
             return this;
         }
 
+        /// <summary>
+        /// Adds an output option for the underlying data source.
+        /// </summary>
+        /// <param name="key">Name of the option</param>
+        /// <param name="value">Bool value of the option</param>
+        /// <returns>This DataFrameWriterV2 object</returns>
+        public DataFrameWriterV2 Option(string key, bool value)
+        {
+            _jvmObject.Invoke("option", key, value);
+            return this;
+        }
+
+        /// <summary>
+        /// Adds an output option for the underlying data source.
+        /// </summary>
+        /// <param name="key">Name of the option</param>
+        /// <param name="value">Long value of the option</param>
+        /// <returns>This DataFrameWriterV2 object</returns>
+        public DataFrameWriterV2 Option(string key, long value)
+        {
+            _jvmObject.Invoke("option", key, value);
+            return this;
+        }
+
+        /// <summary>
+        /// Adds an output option for the underlying data source.
+        /// </summary>
+        /// <param name="key">Name of the option</param>
+        /// <param name="value">Double value of the option</param>
+        /// <returns>This DataFrameWriterV2 object</returns>
+        public DataFrameWriterV2 Option(string key, double value)
+        {
+            _jvmObject.Invoke("option", key, value);
+            return this;
+        }
+
         /// <summary>
         /// Adds output options for the underlying data source.
         /// </summary>
@@ -68,8 +104,9 @@ public DataFrameWriterV2 TableProperty(string property, string value)
         }
 
         /// <summary>
-        /// Partition the output table created by `create`, `createOrReplace`, or `replace` using
-        /// the given columns or transforms.
+        /// Partition the output table created by <see cref="Create"/>,
+        /// <see cref="CreateOrReplace"/>, or <see cref="Replace"/> using the given columns or
+        /// transforms.
         /// </summary>
         /// <param name="column">Column name to partition on</param>
         /// <param name="columns">Columns to partition on</param>
@@ -83,52 +120,34 @@ public DataFrameWriterV2 PartitionedBy(Column column, params Column[] columns)
         /// <summary>
         /// Create a new table from the contents of the data frame.
         /// </summary>
-        public void Create()
-        {
-            _jvmObject.Invoke("create");
-        }
+        public void Create() => _jvmObject.Invoke("create");
 
         /// <summary>
         /// Replace an existing table with the contents of the data frame.
         /// </summary>
-        public void Replace()
-        {
-            _jvmObject.Invoke("replace");
-        }
+        public void Replace() => _jvmObject.Invoke("replace");
 
         /// <summary>
         /// Create a new table or replace an existing table with the contents of the data frame.
         /// </summary>
-        public void CreateOrReplace()
-        {
-            _jvmObject.Invoke("createOrReplace");
-        }
+        public void CreateOrReplace() => _jvmObject.Invoke("createOrReplace");
 
         /// <summary>
         /// Append the contents of the data frame to the output table.
         /// </summary>
-        public void Append()
-        {
-            _jvmObject.Invoke("append");
-        }
+        public void Append() => _jvmObject.Invoke("append");
 
         /// <summary>
         /// Overwrite rows matching the given filter condition with the contents of the data frame
         /// in the output table.
         /// </summary>
         /// <param name="condition">Condition filter to overwrite based on</param>
-        public void Overwrite(Column condition)
-        {
-            _jvmObject.Invoke("overwrite", condition);
-        }
+        public void Overwrite(Column condition) => _jvmObject.Invoke("overwrite", condition);
 
         /// <summary>
         /// Overwrite all partitions for which the data frame contains at least one row with the
         /// contents of the data frame in the output table.
         /// </summary>
-        public void OverwritePartitions()
-        {
-            _jvmObject.Invoke("overwritePartitions");
-        }
+        public void OverwritePartitions() => _jvmObject.Invoke("overwritePartitions");
     }
 }
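Patch 26 above adds typed `Option` overloads for `bool`, `long`, and `double` alongside the existing `string` overload, each forwarding the value to the same JVM-side `option` call. A short sketch of how overload resolution picks them up; the keys are the ones the new tests use, and the values are placeholders:

```csharp
// C# overload resolution routes each literal to the matching Option overload,
// so non-string options no longer have to be formatted as strings by hand.
DataFrameWriterV2 writer = df.WriteTo("default.testtable")
    .Using("json")
    .Option("key1", "value")  // string overload
    .Option("key2", true)     // bool overload
    .Option("key3", 1L)       // long overload
    .Option("key4", 2D);      // double overload
```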
From 419b0840f65bd7184eda0cb8c39d492e1e54f6b4 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Fri, 25 Sep 2020 13:50:04 -0700
Subject: [PATCH 27/35] Adding tests for new APIs

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index 156bf9af3..c677a34b7 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -36,7 +36,10 @@ public void TestSignaturesV3_0_X()
             Assert.IsType<DataFrameWriterV2>(dfwV2.Using("json"));
 
-            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key", "value"));
+            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key1", "value"));
+            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key2", true));
+            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key3", 1L));
+            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key4", 2D));
 
             Assert.IsType<DataFrameWriterV2>(dfwV2.Options(
                 new Dictionary<string, string>() { { "key", "value" } }));

From b76fb9cd9552afc5dfb113f4b88d564add195135 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Sat, 26 Sep 2020 18:28:46 -0700
Subject: [PATCH 28/35] PR comments

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 70 ++++++++++++++++---
 1 file changed, 59 insertions(+), 11 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index c677a34b7..5282b06c5 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System;
 using System.Collections.Generic;
 using System.IO;
 using Microsoft.Spark.E2ETest.Utils;
@@ -51,18 +52,65 @@ public void TestSignaturesV3_0_X()
             Assert.IsType<DataFrameWriterV2>(dfwV2.PartitionedBy(df.Col("age")));
 
-            // Testing caveat *************************************************************
-            // The following functions cannot be tested because of the lack of support for DataSourceV2.
+
+            // Fails with Exception:
+            // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
+            // with v2 tables.
+            try
+            {
+                dfwV2.Replace();
+            }
+            catch (Exception e)
+            {
+                Assert.NotNull(e);
+            }
             // This is because Spark 3.0 currently doesn't support file source as provider for
             // tables. Issue - https://issues.apache.org/jira/browse/SPARK-28396
-            //
-            // 1. DataFrameWriterV2.Replace()
-            // 2. DataFrameWriterV2.CreateOrReplace()
-            // 3. DataFrameWriterV2.Append()
-            // 4. DataFrameWriterV2.Overwrite()
-            // 5. DataFrameWriterV2.OverwritePartitions()
-            // *****************************************************************************
+            // Fails with Exception:
+            // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
+            // with v2 tables.
+            try
+            {
+                dfwV2.CreateOrReplace();
+            }
+            catch (Exception e)
+            {
+                Assert.NotNull(e);
+            }
+
+            // Fails with Exception:
+            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
+            // append in batch mode.
+            try
+            {
+                dfwV2.Append();
+            }
+            catch (Exception e)
+            {
+                Assert.NotNull(e);
+            }
+
+            // Fails with Exception:
+            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
+            // overwrite by filter in batch mode.
+            try
+            {
+                dfwV2.Overwrite(df.Col("age"));
+            }
+            catch (Exception e)
+            {
+                Assert.NotNull(e);
+            }
+
+            // Fails with Exception:
+            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
+            // dynamic overwrite in batch mode.
+            try
+            {
+                dfwV2.OverwritePartitions();
+            }
+            catch (Exception e)
+            {
+                Assert.NotNull(e);
+            }
         }
     }
 }

From b08b0270e604929cc578a09bb95c91f040f1cf38 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Sat, 26 Sep 2020 18:36:08 -0700
Subject: [PATCH 29/35] nit

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index 5282b06c5..cd971b726 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -4,10 +4,8 @@
 
 using System;
 using System.Collections.Generic;
-using System.IO;
 using Microsoft.Spark.E2ETest.Utils;
 using Microsoft.Spark.Sql;
-using Microsoft.Spark.UnitTest.TestUtils;
 using Xunit;
 
 namespace Microsoft.Spark.E2ETest.IpcTests
@@ -52,7 +50,7 @@ public void TestSignaturesV3_0_X()
             Assert.IsType<DataFrameWriterV2>(dfwV2.PartitionedBy(df.Col("age")));
 
-            // Fails with Exception:
+            // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
             // with v2 tables.
             try
@@ -64,7 +62,7 @@ public void TestSignaturesV3_0_X()
                 Assert.NotNull(e);
             }
 
-            // Fails with Exception:
+            // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
             // with v2 tables.
             try
@@ -76,7 +74,7 @@ public void TestSignaturesV3_0_X()
                 Assert.NotNull(e);
             }
 
-            // Fails with Exception:
+            // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // append in batch mode.
             try
@@ -88,7 +86,7 @@ public void TestSignaturesV3_0_X()
                 Assert.NotNull(e);
             }
 
-            // Fails with Exception:
+            // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // overwrite by filter in batch mode.
             try
@@ -100,7 +98,7 @@ public void TestSignaturesV3_0_X()
                 Assert.NotNull(e);
             }
 
-            // Fails with Exception:
+            // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // dynamic overwrite in batch mode.
             try
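Patch 30 below swaps each try/catch-plus-`Assert.NotNull` block for xUnit's `Assert.Throws<Exception>`. The difference matters: the try/catch version passes silently if the call unexpectedly succeeds, because the catch block never runs, while `Assert.Throws` fails the test when nothing is thrown. A minimal sketch, reusing the `dfwV2` writer from the surrounding test (the returned-exception variable is illustrative; the tests themselves only assert that a throw happens):

```csharp
// Before: if Replace() unexpectedly succeeds, no assertion executes and the
// test passes anyway.
try
{
    dfwV2.Replace();
}
catch (Exception e)
{
    Assert.NotNull(e);
}

// After: Assert.Throws fails when no exception is thrown, and returns the
// exception so it can be inspected further if needed.
Exception thrown = Assert.Throws<Exception>(() => dfwV2.Replace());
```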
From c3de5c0734af2f886bb7d57e74cb51c7452c4dc4 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Mon, 28 Sep 2020 17:28:59 -0700
Subject: [PATCH 30/35] PR comments

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 45 +++----------
 1 file changed, 5 insertions(+), 40 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index cd971b726..88491017f 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -53,62 +53,27 @@ public void TestSignaturesV3_0_X()
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
             // with v2 tables.
-            try
-            {
-                dfwV2.Replace();
-            }
-            catch (Exception e)
-            {
-                Assert.NotNull(e);
-            }
+            Assert.Throws<Exception>(() => dfwV2.Replace());
 
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
             // with v2 tables.
-            try
-            {
-                dfwV2.CreateOrReplace();
-            }
-            catch (Exception e)
-            {
-                Assert.NotNull(e);
-            }
+            Assert.Throws<Exception>(() => dfwV2.CreateOrReplace());
 
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // append in batch mode.
-            try
-            {
-                dfwV2.Append();
-            }
-            catch (Exception e)
-            {
-                Assert.NotNull(e);
-            }
+            Assert.Throws<Exception>(() => dfwV2.Append());
 
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // overwrite by filter in batch mode.
-            try
-            {
-                dfwV2.Overwrite(df.Col("age"));
-            }
-            catch (Exception e)
-            {
-                Assert.NotNull(e);
-            }
+            Assert.Throws<Exception>(() => dfwV2.Overwrite(df.Col("age")));
 
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // dynamic overwrite in batch mode.
-            try
-            {
-                dfwV2.OverwritePartitions();
-            }
-            catch (Exception e)
-            {
-                Assert.NotNull(e);
-            }
+            Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
         }
     }
 }

From 8605a71b3b2e8142187ce359f0c2c43ee97341a6 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Mon, 28 Sep 2020 19:18:31 -0700
Subject: [PATCH 31/35] testing hanging after `OverwritePartitions()` call

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index 88491017f..32a358201 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -73,7 +73,7 @@ public void TestSignaturesV3_0_X()
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // dynamic overwrite in batch mode.
-            Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
+            //Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
         }
     }
 }

From 54034bb1f1aba2703bd44f2d9ff3612f45ac64af Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Mon, 28 Sep 2020 23:22:01 -0700
Subject: [PATCH 32/35] testing

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index 32a358201..88491017f 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -73,7 +73,7 @@ public void TestSignaturesV3_0_X()
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // dynamic overwrite in batch mode.
-            //Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
+            Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());

From da76001146cc4f647636c258996935ea26607cbf Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Wed, 30 Sep 2020 12:09:39 -0700
Subject: [PATCH 33/35] testing `OverwritePartitions()` hang

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index 88491017f..f0d927bae 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -73,7 +73,15 @@ public void TestSignaturesV3_0_X()
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // dynamic overwrite in batch mode.
-            Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
+            try
+            {
+                dfwV2.OverwritePartitions();
+            }
+            catch (Exception e)
+            {
+                Assert.NotNull(e);
+            }
+            //Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());

From 31bd1f7a050b6085c354173f57617cc0d498d4d5 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Wed, 30 Sep 2020 14:18:35 -0700
Subject: [PATCH 34/35] reverting change

---
 .../IpcTests/Sql/DataFrameWriterV2Tests.cs | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
index f0d927bae..88491017f 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterV2Tests.cs
@@ -73,15 +73,7 @@ public void TestSignaturesV3_0_X()
             // Throws the following exception:
             // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
             // dynamic overwrite in batch mode.
-            try
-            {
-                dfwV2.OverwritePartitions();
-            }
-            catch (Exception e)
-            {
-                Assert.NotNull(e);
-            }
-            //Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
+            Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
         }
     }
 }

From 260393861cba906511970bfee0ffa0c178c73325 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Thu, 1 Oct 2020 13:36:24 -0700
Subject: [PATCH 35/35] PR comments

---
 src/csharp/Microsoft.Spark/Sql/DataFrame.cs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/csharp/Microsoft.Spark/Sql/DataFrame.cs b/src/csharp/Microsoft.Spark/Sql/DataFrame.cs
index 31ed309f3..c8caab697 100644
--- a/src/csharp/Microsoft.Spark/Sql/DataFrame.cs
+++ b/src/csharp/Microsoft.Spark/Sql/DataFrame.cs
@@ -540,6 +540,7 @@ public DataFrame Observe(string name, Column expr, params Column[] exprs) =>
         /// </summary>
         /// <param name="table">Name of table to write to</param>
         /// <returns>DataFrameWriterV2 object</returns>
+        [Since(Versions.V3_0_0)]
         public DataFrameWriterV2 WriteTo(string table) =>
            new DataFrameWriterV2((JvmObjectReference)_jvmObject.Invoke("writeTo", table));
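With patch 35, `DataFrame.WriteTo` carries the same `[Since(Versions.V3_0_0)]` marker as the `DataFrameWriterV2` it returns. An end-to-end sketch of the surface this series leaves behind; the session setup, data, and table name are illustrative placeholders, not code from the PR:

```csharp
using Microsoft.Spark.Sql;

SparkSession spark = SparkSession.Builder().GetOrCreate();
DataFrame df = spark.Sql("SELECT 20 AS age, 'Alice' AS name");

// Start from a clean slate, mirroring the DROP TABLE step in the E2E test.
spark.Sql("DROP TABLE IF EXISTS default.testtable");

// Requires Spark 3.0+, per the [Since(Versions.V3_0_0)] attribute added above.
df.WriteTo("default.testtable")
    .Using("json")
    .TableProperty("prop", "value")
    .Create();
```

As the tests document, `Replace`, `CreateOrReplace`, `Append`, `Overwrite`, and `OverwritePartitions` still throw `org.apache.spark.sql.AnalysisException` on Spark 3.0 when the table is backed by a file source (SPARK-28396).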