diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
index 48c7fdf89..11037bc6d 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
@@ -3,6 +3,8 @@
 // See the LICENSE file in the project root for more information.
 
 using System.Collections.Generic;
+using System.IO;
+using Microsoft.Spark.E2ETest.Utils;
 using Microsoft.Spark.ML.Feature;
 using Microsoft.Spark.Sql;
 using Xunit;
@@ -47,6 +49,15 @@ public void TestBucketizer()
             Assert.Equal(expectedInputCol, bucketizer.GetInputCol());
             Assert.Equal(expectedOutputCol, bucketizer.GetOutputCol());
             Assert.Equal(expectedSplits, bucketizer.GetSplits());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                string savePath = Path.Join(tempDirectory.Path, "bucket");
+                bucketizer.Save(savePath);
+
+                Bucketizer loadedBucketizer = Bucketizer.Load(savePath);
+                Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
+            }
         }
 
         [Fact]
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
new file mode 100644
index 000000000..7b6882bea
--- /dev/null
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -0,0 +1,65 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Microsoft.Spark.E2ETest.Utils;
+using Microsoft.Spark.ML.Feature;
+using Microsoft.Spark.Sql;
+using Xunit;
+
+namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
+{
+    [Collection("Spark E2E Tests")]
+    public class HashingTFTests
+    {
+        private readonly SparkSession _spark;
+
+        public HashingTFTests(SparkFixture fixture)
+        {
+            _spark = fixture.Spark;
+        }
+
+        [Fact]
+        public void TestHashingTF()
+        {
+            string expectedInputCol = "input_col";
+            string expectedOutputCol = "output_col";
+            int expectedFeatures = 10;
+
+            Assert.IsType<HashingTF>(new HashingTF());
+
+            HashingTF hashingTf = new HashingTF("my-unique-id")
+                .SetNumFeatures(expectedFeatures)
+                .SetInputCol(expectedInputCol)
+                .SetOutputCol(expectedOutputCol);
+
+            Assert.Equal(expectedFeatures, hashingTf.GetNumFeatures());
+            Assert.Equal(expectedInputCol, hashingTf.GetInputCol());
+            Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol());
+
+            DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" +
+                " as input_col");
+
+            DataFrame output = hashingTf.Transform(input);
+            DataFrame outputVector = output.Select(expectedOutputCol);
+
+            Assert.Contains(expectedOutputCol, outputVector.Columns());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                string savePath = Path.Join(tempDirectory.Path, "hashingTF");
+                hashingTf.Save(savePath);
+
+                HashingTF loadedHashingTf = HashingTF.Load(savePath);
+                Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid());
+            }
+
+            hashingTf.SetBinary(true);
+            Assert.True(hashingTf.GetBinary());
+        }
+    }
+}
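The SetBinary(true) call at the end of the test above switches HashingTF into binary mode, which, per the class documentation added later in this diff, clamps every non-zero term count to 1. A rough plain-C# sketch of that effect (the counts are made up for illustration):

    using System;

    double[] termCounts = { 3.0, 0.0, 2.0, 1.0 };
    // Binary mode: any non-zero count becomes 1, zeros stay 0.
    double[] binaryCounts = Array.ConvertAll(termCounts, c => c > 0 ? 1.0 : 0.0);
    // binaryCounts == { 1.0, 0.0, 1.0, 1.0 }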
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
new file mode 100644
index 000000000..623b7322c
--- /dev/null
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
@@ -0,0 +1,70 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+using Microsoft.Spark.E2ETest.Utils;
+using Microsoft.Spark.ML.Feature;
+using Microsoft.Spark.Sql;
+using Xunit;
+
+namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
+{
+    [Collection("Spark E2E Tests")]
+    public class IDFModelTests
+    {
+        private readonly SparkSession _spark;
+
+        public IDFModelTests(SparkFixture fixture)
+        {
+            _spark = fixture.Spark;
+        }
+
+        [Fact]
+        public void TestIDFModel()
+        {
+            int expectedDocFrequency = 1980;
+            string expectedInputCol = "rawFeatures";
+            string expectedOutputCol = "features";
+
+            DataFrame sentenceData =
+                _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");
+
+            Tokenizer tokenizer = new Tokenizer()
+                .SetInputCol("sentence")
+                .SetOutputCol("words");
+
+            DataFrame wordsData = tokenizer.Transform(sentenceData);
+
+            HashingTF hashingTF = new HashingTF()
+                .SetInputCol("words")
+                .SetOutputCol(expectedInputCol)
+                .SetNumFeatures(20);
+
+            DataFrame featurizedData = hashingTF.Transform(wordsData);
+
+            IDF idf = new IDF()
+                .SetInputCol(expectedInputCol)
+                .SetOutputCol(expectedOutputCol)
+                .SetMinDocFreq(expectedDocFrequency);
+
+            IDFModel idfModel = idf.Fit(featurizedData);
+
+            DataFrame rescaledData = idfModel.Transform(featurizedData);
+            Assert.Contains(expectedOutputCol, rescaledData.Columns());
+
+            Assert.Equal(expectedInputCol, idfModel.GetInputCol());
+            Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
+            Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                string modelPath = Path.Join(tempDirectory.Path, "idfModel");
+                idfModel.Save(modelPath);
+
+                IDFModel loadedModel = IDFModel.Load(modelPath);
+                Assert.Equal(idfModel.Uid(), loadedModel.Uid());
+            }
+        }
+    }
+}
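The expectedDocFrequency of 1980 above is deliberately larger than the one-document corpus, so every feature is filtered to zero. The IDF class added later in this diff documents the formula idf = log((m + 1) / (d(t) + 1)) together with the minDocFreq cutoff; a small stand-alone sketch of both (plain C#, names illustrative, no Spark involved):

    using System;

    // idf per the documented formula, with the minDocFreq filter applied
    static double Idf(long numDocs, long docFreq, int minDocFreq) =>
        docFreq < minDocFreq ? 0.0 : Math.Log((numDocs + 1.0) / (docFreq + 1.0));

    Console.WriteLine(Idf(1, 1, 1980));  // 0 — the term is under the cutoff
    Console.WriteLine(Idf(100, 10, 1));  // log(101 / 11) ≈ 2.22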
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
new file mode 100644
index 000000000..3dea63de7
--- /dev/null
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
@@ -0,0 +1,49 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+using Microsoft.Spark.E2ETest.Utils;
+using Microsoft.Spark.ML.Feature;
+using Microsoft.Spark.Sql;
+using Xunit;
+
+namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
+{
+    [Collection("Spark E2E Tests")]
+    public class IDFTests
+    {
+        private readonly SparkSession _spark;
+
+        public IDFTests(SparkFixture fixture)
+        {
+            _spark = fixture.Spark;
+        }
+
+        [Fact]
+        public void TestIDFModel()
+        {
+            string expectedInputCol = "rawFeatures";
+            string expectedOutputCol = "features";
+            int expectedDocFrequency = 100;
+
+            IDF idf = new IDF()
+                .SetInputCol(expectedInputCol)
+                .SetOutputCol(expectedOutputCol)
+                .SetMinDocFreq(expectedDocFrequency);
+
+            Assert.Equal(expectedInputCol, idf.GetInputCol());
+            Assert.Equal(expectedOutputCol, idf.GetOutputCol());
+            Assert.Equal(expectedDocFrequency, idf.GetMinDocFreq());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                string savePath = Path.Join(tempDirectory.Path, "IDF");
+                idf.Save(savePath);
+
+                IDF loadedIdf = IDF.Load(savePath);
+                Assert.Equal(idf.Uid(), loadedIdf.Uid());
+            }
+        }
+    }
+}
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs
new file mode 100644
index 000000000..8cdb4e03a
--- /dev/null
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs
@@ -0,0 +1,55 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+using Microsoft.Spark.E2ETest.Utils;
+using Microsoft.Spark.ML.Feature;
+using Microsoft.Spark.Sql;
+using Xunit;
+
+namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
+{
+    [Collection("Spark E2E Tests")]
+    public class TokenizerTests
+    {
+        private readonly SparkSession _spark;
+
+        public TokenizerTests(SparkFixture fixture)
+        {
+            _spark = fixture.Spark;
+        }
+
+        [Fact]
+        public void TestTokenizer()
+        {
+            string expectedUid = "theUid";
+            string expectedInputCol = "input_col";
+            string expectedOutputCol = "output_col";
+
+            DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" +
+                " from range(100)");
+
+            Tokenizer tokenizer = new Tokenizer(expectedUid)
+                .SetInputCol(expectedInputCol)
+                .SetOutputCol(expectedOutputCol);
+
+            DataFrame output = tokenizer.Transform(input);
+
+            Assert.Contains(output.Schema().Fields, f => f.Name == expectedOutputCol);
+            Assert.Equal(expectedInputCol, tokenizer.GetInputCol());
+            Assert.Equal(expectedOutputCol, tokenizer.GetOutputCol());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                string savePath = Path.Join(tempDirectory.Path, "Tokenizer");
+                tokenizer.Save(savePath);
+
+                Tokenizer loadedTokenizer = Tokenizer.Load(savePath);
+                Assert.Equal(tokenizer.Uid(), loadedTokenizer.Uid());
+            }
+
+            Assert.Equal(expectedUid, tokenizer.Uid());
+        }
+    }
+}
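Tokenizer is documented later in this diff as lowercasing the input and splitting it on whitespace, so the test input above tokenizes deterministically; a rough plain-C# equivalent of that behaviour (an approximation for intuition, not the actual JVM-side implementation):

    string text = "hello I AM a string TO, TOKENIZE";
    string[] tokens = text.ToLowerInvariant().Split(' ');
    // ["hello", "i", "am", "a", "string", "to,", "tokenize"] — punctuation is kept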
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index 4eefd119d..823f13c1a 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -8,7 +8,6 @@
 using Microsoft.Spark.Interop;
 using Microsoft.Spark.Interop.Ipc;
 using Microsoft.Spark.Sql;
-using Microsoft.Spark.Sql.Types;
 
 namespace Microsoft.Spark.ML.Feature
 {
@@ -23,18 +22,17 @@
     /// </summary>
     public class Bucketizer : IJvmObjectReferenceProvider
     {
-        internal Bucketizer(JvmObjectReference jvmObject)
-        {
-            _jvmObject = jvmObject;
-        }
+        private static readonly string s_bucketizerClassName =
+            "org.apache.spark.ml.feature.Bucketizer";
+
+        private readonly JvmObjectReference _jvmObject;
 
         /// <summary>
         /// Create a <see cref="Bucketizer"/> without any parameters
         /// </summary>
         public Bucketizer()
         {
-            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
-                "org.apache.spark.ml.feature.Bucketizer");
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_bucketizerClassName);
         }
@@ -44,11 +42,14 @@ public Bucketizer()
         /// </summary>
         /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
         public Bucketizer(string uid)
         {
-            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(
-                "org.apache.spark.ml.feature.Bucketizer", uid);
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_bucketizerClassName, uid);
         }
 
-        private readonly JvmObjectReference _jvmObject;
+        internal Bucketizer(JvmObjectReference jvmObject)
+        {
+            _jvmObject = jvmObject;
+        }
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;
@@ -70,7 +71,7 @@ public double[] GetSplits()
         /// bucket, which also includes y. The splits should be of length >= 3 and strictly
         /// increasing. Values outside the splits specified will be treated as errors.
         /// </summary>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetSplits(double[] value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value));
         }
@@ -95,7 +96,7 @@ public double[][] GetSplitsArray()
         /// by splits x,y holds values in the range [x,y) except the last bucket, which also
         /// includes y. The splits should be of length >= 3 and strictly increasing.
         /// Values outside the splits specified will be treated as errors.
         /// </summary>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetSplitsArray(double[][] value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", (object)value));
         }
@@ -116,7 +117,7 @@ public string GetInputCol()
         /// buckets
         /// </summary>
         /// <param name="value">The name of the column to use as the source of the buckets</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetInputCol(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value));
         }
@@ -140,7 +141,7 @@ public IEnumerable<string> GetInputCols()
         /// sets of buckets and two output columns.
         /// </summary>
         /// <param name="value">List of input columns to use as sources for buckets</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetInputCols(IEnumerable<string> value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value));
         }
@@ -161,7 +162,7 @@ public string GetOutputCol()
         /// name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column which contains the bucket ID</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetOutputCol(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value));
         }
@@ -181,36 +182,47 @@ public IEnumerable<string> GetOutputCols()
         /// The list of columns that the <see cref="Bucketizer"/> will create in the DataFrame.
         /// </summary>
         /// <param name="value">List of column names which will contain the bucket ID</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetOutputCols(List<string> value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value));
         }
+
+        /// <summary>
+        /// Loads the <see cref="Bucketizer"/> that was previously saved using Save
+        /// </summary>
+        /// <param name="path">The path the previous <see cref="Bucketizer"/> was saved to</param>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
+        public static Bucketizer Load(string path)
+        {
+            return WrapAsBucketizer(
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_bucketizerClassName, "load", path));
+        }
+
+        /// <summary>
+        /// Saves the <see cref="Bucketizer"/> so that it can be loaded later using Load
+        /// </summary>
+        /// <param name="path">The path to save the <see cref="Bucketizer"/> to</param>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
+        public Bucketizer Save(string path)
+        {
+            return WrapAsBucketizer(_jvmObject.Invoke("save", path));
+        }
 
         /// <summary>
         /// Executes the <see cref="Bucketizer"/> and transforms the DataFrame to include the new
         /// column or columns with the bucketed data.
         /// </summary>
         /// <param name="source">The DataFrame to add the bucketed data to</param>
-        /// <returns><see cref="DataFrame"/> containing the original data and the new bucketed
-        /// columns</returns>
+        /// <returns>
+        /// <see cref="DataFrame"/> containing the original data and the new bucketed columns
+        /// </returns>
         public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
         }
-
-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="Bucketizer"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="Bucketizer"/></param>
-        /// <returns><see cref="Bucketizer"/></returns>
-        private static Bucketizer WrapAsBucketizer(object obj)
-        {
-            return new Bucketizer((JvmObjectReference)obj);
-        }
 
         /// <summary>
         /// The uid that was used to create the <see cref="Bucketizer"/>. If no UID is passed in
         /// when creating the <see cref="Bucketizer"/> then a random UID is created when the
         /// <see cref="Bucketizer"/> is created.
@@ -238,10 +250,13 @@ public string GetHandleInvalid()
         /// Choices are "skip", "error" or "keep". Default is "error"
         /// </summary>
         /// <param name="value">"skip", "error" or "keep"</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetHandleInvalid(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString()));
         }
+
+        private static Bucketizer WrapAsBucketizer(object obj) =>
+            new Bucketizer((JvmObjectReference)obj);
     }
 }
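For reference, a hedged usage sketch of the splits semantics documented above — n+1 split points produce n buckets, each covering [x, y) except the last, which includes y; unbounded outer splits are a common way to avoid the "values outside the splits are errors" behaviour (column names and the input DataFrame are illustrative):

    Bucketizer bucketizer = new Bucketizer()
        .SetSplits(new[] { double.NegativeInfinity, 0.0, 10.0, double.PositiveInfinity })
        .SetInputCol("value")      // assumed numeric column
        .SetOutputCol("bucket")    // receives the bucket index: 0, 1 or 2
        .SetHandleInvalid("keep"); // "skip", "error" or "keep", per SetHandleInvalid above
    DataFrame bucketed = bucketizer.Transform(inputDataFrame);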
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
new file mode 100644
index 000000000..50b4fe04a
--- /dev/null
+++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
@@ -0,0 +1,184 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using Microsoft.Spark.Interop;
+using Microsoft.Spark.Interop.Ipc;
+using Microsoft.Spark.Sql;
+using Microsoft.Spark.Sql.Types;
+
+namespace Microsoft.Spark.ML.Feature
+{
+    /// <summary>
+    /// A <see cref="HashingTF"/> maps a sequence of terms to their term frequencies using the
+    /// hashing trick. Currently we use Austin Appleby's MurmurHash 3 algorithm
+    /// (MurmurHash3_x86_32) to calculate the hash code value for the term object. Since a simple
+    /// modulo is used to transform the hash function to a column index, it is advisable to use a
+    /// power of two as the numFeatures parameter; otherwise the features will not be mapped
+    /// evenly to the columns.
+    /// </summary>
+    public class HashingTF : IJvmObjectReferenceProvider
+    {
+        private static readonly string s_hashingTfClassName =
+            "org.apache.spark.ml.feature.HashingTF";
+
+        private readonly JvmObjectReference _jvmObject;
+
+        /// <summary>
+        /// Create a <see cref="HashingTF"/> without any parameters
+        /// </summary>
+        public HashingTF()
+        {
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_hashingTfClassName);
+        }
+
+        /// <summary>
+        /// Create a <see cref="HashingTF"/> with a UID that is used to give the
+        /// <see cref="HashingTF"/> a unique ID
+        /// </summary>
+        /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
+        public HashingTF(string uid)
+        {
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_hashingTfClassName, uid);
+        }
+
+        internal HashingTF(JvmObjectReference jvmObject)
+        {
+            _jvmObject = jvmObject;
+        }
+
+        JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;
+
+        /// <summary>
+        /// Loads the <see cref="HashingTF"/> that was previously saved using Save
+        /// </summary>
+        /// <param name="path">The path the previous <see cref="HashingTF"/> was saved to</param>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
+        public static HashingTF Load(string path)
+        {
+            return WrapAsHashingTF(
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_hashingTfClassName, "load", path));
+        }
+
+        /// <summary>
+        /// Saves the <see cref="HashingTF"/> so that it can be loaded later using Load
+        /// </summary>
+        /// <param name="path">The path to save the <see cref="HashingTF"/> to</param>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
+        public HashingTF Save(string path)
+        {
+            return WrapAsHashingTF(_jvmObject.Invoke("save", path));
+        }
+
+        /// <summary>
+        /// Gets the binary toggle that controls term frequency counts
+        /// </summary>
+        /// <returns>Flag showing whether the binary toggle is on or off</returns>
+        public bool GetBinary()
+        {
+            return (bool)_jvmObject.Invoke("getBinary");
+        }
+
+        /// <summary>
+        /// Binary toggle to control term frequency counts.
+        /// If true, all non-zero counts are set to 1. This is useful for discrete probabilistic
+        /// models that model binary events rather than integer counts
+        /// </summary>
+        /// <param name="value">binary toggle, default is false</param>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
+        public HashingTF SetBinary(bool value)
+        {
+            return WrapAsHashingTF(_jvmObject.Invoke("setBinary", value));
+        }
+
+        /// <summary>
+        /// Gets the column that the <see cref="HashingTF"/> should read from
+        /// </summary>
+        /// <returns>string, the name of the input column</returns>
+        public string GetInputCol()
+        {
+            return (string)_jvmObject.Invoke("getInputCol");
+        }
+
+        /// <summary>
+        /// Sets the column that the <see cref="HashingTF"/> should read from
+        /// </summary>
+        /// <param name="value">The name of the column to use as the source</param>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
+        public HashingTF SetInputCol(string value)
+        {
+            return WrapAsHashingTF(_jvmObject.Invoke("setInputCol", value));
+        }
+
+        /// <summary>
+        /// The <see cref="HashingTF"/> will create a new column in the <see cref="DataFrame"/>,
+        /// this is the name of the new column.
+        /// </summary>
+        /// <returns>string, the name of the output col</returns>
+        public string GetOutputCol()
+        {
+            return (string)_jvmObject.Invoke("getOutputCol");
+        }
+
+        /// <summary>
+        /// The <see cref="HashingTF"/> will create a new column in the <see cref="DataFrame"/>,
+        /// this is the name of the new column.
+        /// </summary>
+        /// <param name="value">The name of the new column</param>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
+        public HashingTF SetOutputCol(string value)
+        {
+            return WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value));
+        }
+
+        /// <summary>
+        /// Gets the number of features that should be used. Since a simple modulo is used to
+        /// transform the hash function to a column index, it is advisable to use a power of two
+        /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the
+        /// columns.
+        /// </summary>
+        /// <returns>The number of features to be used</returns>
+        public int GetNumFeatures()
+        {
+            return (int)_jvmObject.Invoke("getNumFeatures");
+        }
+
+        /// <summary>
+        /// Sets the number of features that should be used. Since a simple modulo is used to
+        /// transform the hash function to a column index, it is advisable to use a power of two as
+        /// the numFeatures parameter; otherwise the features will not be mapped evenly to the
+        /// columns.
+        /// </summary>
+        /// <param name="value">int, the number of features to be used</param>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
+        public HashingTF SetNumFeatures(int value)
+        {
+            return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value));
+        }
+
+        /// <summary>
+        /// An immutable unique ID for the object and its derivatives.
+        /// </summary>
+        /// <returns>string, unique ID for the object</returns>
+        public string Uid()
+        {
+            return (string)_jvmObject.Invoke("uid");
+        }
+
+        /// <summary>
+        /// Executes the <see cref="HashingTF"/> and transforms the DataFrame to include the new
+        /// column or columns with the term-frequency vectors.
+        /// </summary>
+        /// <param name="source">The <see cref="DataFrame"/> to add the term frequencies to</param>
+        /// <returns>
+        /// <see cref="DataFrame"/> containing the original data and the term frequencies
+        /// </returns>
+        public DataFrame Transform(DataFrame source)
+        {
+            return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
+        }
+
+        private static HashingTF WrapAsHashingTF(object obj) =>
+            new HashingTF((JvmObjectReference)obj);
+    }
+}
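The class summary above explains why numFeatures should be a power of two: the term's hash is reduced to a column index with a simple modulo. A rough sketch of that mapping (GetHashCode stands in for Spark's MurmurHash3_x86_32 purely for brevity, so the indices will not match Spark's):

    static int TermIndex(string term, int numFeatures)
    {
        int hash = term.GetHashCode(); // stand-in; Spark uses MurmurHash3_x86_32
        return ((hash % numFeatures) + numFeatures) % numFeatures; // non-negative index
    }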
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
new file mode 100644
index 000000000..5c2259aaf
--- /dev/null
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
@@ -0,0 +1,154 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.Spark.Interop;
+using Microsoft.Spark.Interop.Ipc;
+using Microsoft.Spark.Sql;
+
+namespace Microsoft.Spark.ML.Feature
+{
+    /// <summary>
+    /// Inverse document frequency (IDF). The standard formulation is used:
+    /// idf = log((m + 1) / (d(t) + 1)), where m is the total number of documents and d(t) is
+    /// the number of documents that contain term t.
+    ///
+    /// This implementation supports filtering out terms which do not appear in a minimum number
+    /// of documents (controlled by the variable minDocFreq). For terms that are not in at least
+    /// minDocFreq documents, the IDF is found as 0, resulting in TF-IDFs of 0.
+    /// </summary>
+    public class IDF : IJvmObjectReferenceProvider
+    {
+        private static readonly string s_IDFClassName = "org.apache.spark.ml.feature.IDF";
+
+        private readonly JvmObjectReference _jvmObject;
+
+        /// <summary>
+        /// Create a <see cref="IDF"/> without any parameters
+        /// </summary>
+        public IDF()
+        {
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFClassName);
+        }
+
+        /// <summary>
+        /// Create a <see cref="IDF"/> with a UID that is used to give the
+        /// <see cref="IDF"/> a unique ID
+        /// </summary>
+        /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
+        public IDF(string uid)
+        {
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFClassName, uid);
+        }
+
+        internal IDF(JvmObjectReference jvmObject)
+        {
+            _jvmObject = jvmObject;
+        }
+
+        JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;
+
+        /// <summary>
+        /// Gets the column that the <see cref="IDF"/> should read from
+        /// </summary>
+        /// <returns>string, input column</returns>
+        public string GetInputCol()
+        {
+            return (string)(_jvmObject.Invoke("getInputCol"));
+        }
+
+        /// <summary>
+        /// Sets the column that the <see cref="IDF"/> should read from
+        /// </summary>
+        /// <param name="value">The name of the column to use as the source</param>
+        /// <returns>New <see cref="IDF"/> object</returns>
+        public IDF SetInputCol(string value)
+        {
+            return WrapAsIDF(_jvmObject.Invoke("setInputCol", value));
+        }
+
+        /// <summary>
+        /// The <see cref="IDF"/> will create a new column in the DataFrame, this is the
+        /// name of the new column.
+        /// </summary>
+        /// <returns>string, the output column</returns>
+        public string GetOutputCol()
+        {
+            return (string)(_jvmObject.Invoke("getOutputCol"));
+        }
+
+        /// <summary>
+        /// The <see cref="IDF"/> will create a new column in the DataFrame, this is the
+        /// name of the new column.
+        /// </summary>
+        /// <param name="value">The name of the new column</param>
+        /// <returns>New <see cref="IDF"/> object</returns>
+        public IDF SetOutputCol(string value)
+        {
+            return WrapAsIDF(_jvmObject.Invoke("setOutputCol", value));
+        }
+
+        /// <summary>
+        /// Minimum number of documents in which a term should appear for filtering
+        /// </summary>
+        /// <returns>int, minimum number of documents in which a term should appear</returns>
+        public int GetMinDocFreq()
+        {
+            return (int)_jvmObject.Invoke("getMinDocFreq");
+        }
+
+        /// <summary>
+        /// Minimum number of documents in which a term should appear for filtering
+        /// </summary>
+        /// <param name="value">int, the minimum number of documents a term should appear in</param>
+        /// <returns>New <see cref="IDF"/> object</returns>
+        public IDF SetMinDocFreq(int value)
+        {
+            return WrapAsIDF(_jvmObject.Invoke("setMinDocFreq", value));
+        }
+
+        /// <summary>
+        /// Fits a model to the input data.
+        /// </summary>
+        /// <param name="source">The <see cref="DataFrame"/> to fit the model to</param>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
+        public IDFModel Fit(DataFrame source)
+        {
+            return new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source));
+        }
+
+        /// <summary>
+        /// The uid that was used to create the <see cref="IDF"/>. If no UID is passed in
+        /// when creating the <see cref="IDF"/> then a random UID is created when the
+        /// <see cref="IDF"/> is created.
+        /// </summary>
+        /// <returns>string UID identifying the <see cref="IDF"/></returns>
+        public string Uid()
+        {
+            return (string)_jvmObject.Invoke("uid");
+        }
+
+        /// <summary>
+        /// Loads the <see cref="IDF"/> that was previously saved using Save
+        /// </summary>
+        /// <param name="path">The path the previous <see cref="IDF"/> was saved to</param>
+        /// <returns>New <see cref="IDF"/> object, loaded from path</returns>
+        public static IDF Load(string path)
+        {
+            return WrapAsIDF(
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFClassName, "load", path));
+        }
+
+        /// <summary>
+        /// Saves the <see cref="IDF"/> so that it can be loaded later using Load
+        /// </summary>
+        /// <param name="path">The path to save the <see cref="IDF"/> to</param>
+        /// <returns>New <see cref="IDF"/> object</returns>
+        public IDF Save(string path)
+        {
+            return WrapAsIDF(_jvmObject.Invoke("save", path));
+        }
+
+        private static IDF WrapAsIDF(object obj) => new IDF((JvmObjectReference)obj);
+    }
+}
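IDF follows Spark's estimator pattern: Fit learns the per-term document frequencies and returns the IDFModel defined next, which performs the actual transformation. A composition sketch (the DataFrame name is assumed from a prior featurizing step such as HashingTF.Transform):

    IDFModel model = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features")
        .Fit(featurizedData);                     // estimator -> fitted model
    DataFrame rescaled = model.Transform(featurizedData);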
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
new file mode 100644
index 000000000..4fc8a4f30
--- /dev/null
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -0,0 +1,145 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.Spark.Interop;
+using Microsoft.Spark.Interop.Ipc;
+using Microsoft.Spark.Sql;
+
+namespace Microsoft.Spark.ML.Feature
+{
+    /// <summary>
+    /// A <see cref="IDFModel"/> that is fitted by an <see cref="IDF"/> and rescales
+    /// term-frequency vectors by each term's inverse document frequency.
+    /// </summary>
+    public class IDFModel : IJvmObjectReferenceProvider
+    {
+        private static readonly string s_IDFModelClassName =
+            "org.apache.spark.ml.feature.IDFModel";
+
+        private readonly JvmObjectReference _jvmObject;
+
+        /// <summary>
+        /// Create a <see cref="IDFModel"/> without any parameters
+        /// </summary>
+        public IDFModel()
+        {
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFModelClassName);
+        }
+
+        /// <summary>
+        /// Create a <see cref="IDFModel"/> with a UID that is used to give the
+        /// <see cref="IDFModel"/> a unique ID
+        /// </summary>
+        /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
+        public IDFModel(string uid)
+        {
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFModelClassName, uid);
+        }
+
+        internal IDFModel(JvmObjectReference jvmObject)
+        {
+            _jvmObject = jvmObject;
+        }
+
+        JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;
+
+        /// <summary>
+        /// Gets the column that the <see cref="IDFModel"/> should read from
+        /// </summary>
+        /// <returns>string, input column</returns>
+        public string GetInputCol()
+        {
+            return (string)(_jvmObject.Invoke("getInputCol"));
+        }
+
+        /// <summary>
+        /// Sets the column that the <see cref="IDFModel"/> should read from
+        /// </summary>
+        /// <param name="value">The name of the column to use as the source</param>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
+        public IDFModel SetInputCol(string value)
+        {
+            return WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value));
+        }
+
+        /// <summary>
+        /// The <see cref="IDFModel"/> will create a new column in the <see cref="DataFrame"/>,
+        /// this is the name of the new column.
+        /// </summary>
+        /// <returns>string, the output column</returns>
+        public string GetOutputCol()
+        {
+            return (string)(_jvmObject.Invoke("getOutputCol"));
+        }
+
+        /// <summary>
+        /// The <see cref="IDFModel"/> will create a new column in the DataFrame, this is the
+        /// name of the new column.
+        /// </summary>
+        /// <param name="value">
+        /// The name of the new column which contains the rescaled features
+        /// </param>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
+        public IDFModel SetOutputCol(string value)
+        {
+            return WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value));
+        }
+
+        /// <summary>
+        /// Minimum number of documents in which a term should appear for filtering
+        /// </summary>
+        /// <returns>Minimum number of documents a term should appear in</returns>
+        public int GetMinDocFreq()
+        {
+            return (int)_jvmObject.Invoke("getMinDocFreq");
+        }
+
+        /// <summary>
+        /// Executes the <see cref="IDFModel"/> and transforms the <see cref="DataFrame"/> to
+        /// include the new column or columns with the rescaled features.
+        /// </summary>
+        /// <param name="source">The <see cref="DataFrame"/> to add the rescaled features to</param>
+        /// <returns>
+        /// <see cref="DataFrame"/> containing the original data and the rescaled features
+        /// </returns>
+        public DataFrame Transform(DataFrame source)
+        {
+            return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
+        }
+
+        /// <summary>
+        /// The uid that was used to create the <see cref="IDFModel"/>. If no UID is passed in
+        /// when creating the <see cref="IDFModel"/> then a random UID is created when the
+        /// <see cref="IDFModel"/> is created.
+        /// </summary>
+        /// <returns>string UID identifying the <see cref="IDFModel"/></returns>
+        public string Uid()
+        {
+            return (string)_jvmObject.Invoke("uid");
+        }
+
+        /// <summary>
+        /// Loads the <see cref="IDFModel"/> that was previously saved using Save
+        /// </summary>
+        /// <param name="path">The path the previous <see cref="IDFModel"/> was saved to</param>
+        /// <returns>New <see cref="IDFModel"/> object, loaded from path</returns>
+        public static IDFModel Load(string path)
+        {
+            return WrapAsIDFModel(
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_IDFModelClassName, "load", path));
+        }
+
+        /// <summary>
+        /// Saves the <see cref="IDFModel"/> so that it can be loaded later using Load
+        /// </summary>
+        /// <param name="path">The path to save the <see cref="IDFModel"/> to</param>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
+        public IDFModel Save(string path)
+        {
+            return WrapAsIDFModel(_jvmObject.Invoke("save", path));
+        }
+
+        private static IDFModel WrapAsIDFModel(object obj) =>
+            new IDFModel((JvmObjectReference)obj);
+    }
+}
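Every class in this diff exposes the same Save/Load pair — an instance method that invokes the JVM object's save and a static load on the Java class — which is what the E2E tests round-trip through a temporary directory. In isolation the pattern looks like this (the path is illustrative):

    idfModel.Save("/tmp/idfModel");               // persists JVM-side metadata and data
    IDFModel restored = IDFModel.Load("/tmp/idfModel");
    // restored.Uid() equals idfModel.Uid(), which is what the tests assert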
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
new file mode 100644
index 000000000..c411309dc
--- /dev/null
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -0,0 +1,136 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.Spark.Interop;
+using Microsoft.Spark.Interop.Ipc;
+using Microsoft.Spark.Sql;
+
+namespace Microsoft.Spark.ML.Feature
+{
+    /// <summary>
+    /// A <see cref="Tokenizer"/> that converts the input string to lowercase and then splits it
+    /// by white spaces.
+    /// </summary>
+    public class Tokenizer : IJvmObjectReferenceProvider
+    {
+        private static readonly string s_tokenizerClassName =
+            "org.apache.spark.ml.feature.Tokenizer";
+
+        private readonly JvmObjectReference _jvmObject;
+
+        /// <summary>
+        /// Create a <see cref="Tokenizer"/> without any parameters
+        /// </summary>
+        public Tokenizer()
+        {
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_tokenizerClassName);
+        }
+
+        /// <summary>
+        /// Create a <see cref="Tokenizer"/> with a UID that is used to give the
+        /// <see cref="Tokenizer"/> a unique ID
+        /// </summary>
+        /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
+        public Tokenizer(string uid)
+        {
+            _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_tokenizerClassName, uid);
+        }
+
+        internal Tokenizer(JvmObjectReference jvmObject)
+        {
+            _jvmObject = jvmObject;
+        }
+
+        JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;
+
+        /// <summary>
+        /// Gets the column that the <see cref="Tokenizer"/> should read from
+        /// </summary>
+        /// <returns>string, input column</returns>
+        public string GetInputCol()
+        {
+            return (string)(_jvmObject.Invoke("getInputCol"));
+        }
+
+        /// <summary>
+        /// Sets the column that the <see cref="Tokenizer"/> should read from
+        /// </summary>
+        /// <param name="value">The name of the column to use as the source</param>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
+        public Tokenizer SetInputCol(string value)
+        {
+            return WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value));
+        }
+
+        /// <summary>
+        /// The <see cref="Tokenizer"/> will create a new column in the DataFrame, this is the
+        /// name of the new column.
+        /// </summary>
+        /// <returns>string, the output column</returns>
+        public string GetOutputCol()
+        {
+            return (string)(_jvmObject.Invoke("getOutputCol"));
+        }
+
+        /// <summary>
+        /// The <see cref="Tokenizer"/> will create a new column in the DataFrame, this is the
+        /// name of the new column.
+        /// </summary>
+        /// <param name="value">The name of the new column</param>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
+        public Tokenizer SetOutputCol(string value)
+        {
+            return WrapAsTokenizer(_jvmObject.Invoke("setOutputCol", value));
+        }
+
+        /// <summary>
+        /// Executes the <see cref="Tokenizer"/> and transforms the DataFrame to include the new
+        /// column
+        /// </summary>
+        /// <param name="source">The DataFrame to transform</param>
+        /// <returns>
+        /// New <see cref="DataFrame"/> object with the source transformed
+        /// </returns>
+        public DataFrame Transform(DataFrame source)
+        {
+            return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
+        }
+
+        /// <summary>
+        /// The uid that was used to create the <see cref="Tokenizer"/>. If no UID is passed in
+        /// when creating the <see cref="Tokenizer"/> then a random UID is created when the
+        /// <see cref="Tokenizer"/> is created.
+        /// </summary>
+        /// <returns>string UID identifying the <see cref="Tokenizer"/></returns>
+        public string Uid()
+        {
+            return (string)_jvmObject.Invoke("uid");
+        }
+
+        /// <summary>
+        /// Loads the <see cref="Tokenizer"/> that was previously saved using Save
+        /// </summary>
+        /// <param name="path">The path the previous <see cref="Tokenizer"/> was saved to</param>
+        /// <returns>New <see cref="Tokenizer"/> object, loaded from path</returns>
+        public static Tokenizer Load(string path)
+        {
+            return WrapAsTokenizer(
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_tokenizerClassName, "load", path));
+        }
+
+        /// <summary>
+        /// Saves the <see cref="Tokenizer"/> so that it can be loaded later using Load
+        /// </summary>
+        /// <param name="path">The path to save the <see cref="Tokenizer"/> to</param>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
+        public Tokenizer Save(string path)
+        {
+            return WrapAsTokenizer(_jvmObject.Invoke("save", path));
+        }
+
+        private static Tokenizer WrapAsTokenizer(object obj) =>
+            new Tokenizer((JvmObjectReference)obj);
+    }
+}
diff --git a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs
index 3bf4173c3..4de1f4ded 100644
--- a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs
+++ b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs
@@ -160,6 +160,14 @@ internal static DataType ParseDataType(JToken json)
             }
             else if (typeName == "udt")
             {
+                if (typeJObject.TryGetValue("class", out JToken classToken))
+                {
+                    if (typeJObject.TryGetValue("sqlType", out JToken sqlTypeToken))
+                    {
+                        return new StructType((JObject)sqlTypeToken);
+                    }
+                }
+
                 throw new NotImplementedException();
             }
         }
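The new branch in ParseDataType recognises UDT JSON that carries both a class name and an inner sqlType struct, returning the struct's schema instead of throwing. A plausible input shape (VectorUDT is a real Spark type, but the exact fields of its sqlType are abridged here for illustration):

    // {
    //     "type": "udt",
    //     "class": "org.apache.spark.ml.linalg.VectorUDT",
    //     "sqlType": { "type": "struct", "fields": [ ... ] }
    // }
    //
    // ParseDataType on such a token now returns new StructType((JObject)sqlTypeToken)
    // built from "sqlType"; a udt entry missing either key still throws
    // NotImplementedException.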