From b35489ad02f524989ef83a55b5713aba9e648d89 Mon Sep 17 00:00:00 2001 From: ruih Date: Wed, 7 Aug 2019 11:54:59 -0700 Subject: [PATCH] This commit adds two unit tests which mimic the reducers used in email search scenarios today. They are typical examples as it covers the two most common data transformation --- takes N rows and outputs 1 row, and takes N rows and outputs M rows. We use the two unit tests to demonstrate the feasibility to migrate existing reducers used by email search to HDI. Note this isn't a performance test yet. 1) The first reducer computes the top N contacts per user. It first groups by the user id and then collapses each group into one row by concatenating the records. A Udf is used in order to apply some string operations. 2) The second reducer takes N rows and outputs M rows. Given a succeeded search query with flattened folder and item list, the reducer explodes the input dataframe by pairing the corresponding folder and item ids. If the folder and item list contain multiple records, the reducer will generate multiple rows as output. --- .../Resources/neighbors.json | 6 + .../Resources/search_actions.json | 1 + .../ScenarioTests/EmailSearchTests.cs | 163 ++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/Resources/neighbors.json create mode 100644 src/csharp/Microsoft.Spark.E2ETest/Resources/search_actions.json create mode 100644 src/csharp/Microsoft.Spark.E2ETest/ScenarioTests/EmailSearchTests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/Resources/neighbors.json b/src/csharp/Microsoft.Spark.E2ETest/Resources/neighbors.json new file mode 100644 index 000000000..03c8c1587 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/Resources/neighbors.json @@ -0,0 +1,6 @@ +{"Id":"Id1@72F988BF-86F1-41AF-91AB-2D7CD011DB47","DisplayName":"AliceFN AliceLN","GivenName":"AliceFN","Surname":"AliceLN","IMAddress":"sip:alice@microsoft.com","EmailAddress":"alice@microsoft.com","RelevanceScore":1101.0,"puser":"ruih","ptenant":"MSFT"} +{"Id":"Id2@72F988BF-86F1-41AF-91AB-2D7CD011DB47","DisplayName":"BobFN BobLN","GivenName":"BobFN","Surname":"BobLN","IMAddress":"sip:bob@microsoft.com","EmailAddress":"bob@microsoft.com","RelevanceScore":900.0,"puser":"ruih","ptenant":"MSFT"} +{"Id":"Id3@72F988BF-86F1-41AF-91AB-2D7CD011DB47","DisplayName":"CharlieFN CharlieLN","GivenName":"CharlieFN","Surname":"CharlieLN","IMAddress":"sip:charlie@microsoft.com","EmailAddress":"charlie@microsoft.com","RelevanceScore":857.0,"puser":"ruih","ptenant":"MSFT"} +{"Id":"Id4@72F988BF-86F1-41AF-91AB-2D7CD011DB47","DisplayName":"DougFN DougLN","GivenName":"DougFN","Surname":"DougLN","IMAddress":"sip:doug@microsoft.com","EmailAddress":"doug@microsoft.com","RelevanceScore":1101.0,"puser":"rui","ptenant":"MSFT"} +{"Id":"Id5@72F988BF-86F1-41AF-91AB-2D7CD011DB47","DisplayName":"ElvaFN ElvaLN","GivenName":"ElvaFN","Surname":"ElvaLN","IMAddress":"elva@microsoft.com","EmailAddress":"elva@microsoft.com","RelevanceScore":900.0,"puser":"rui","ptenant":"MSFT"} +{"Id":"Id6@72F988BF-86F1-41AF-91AB-2D7CD011DB47","DisplayName":"FrankFN FrankLN","GivenName":"FrankFN","Surname":"FrankLN","IMAddress":"sip:frank@microsoft.com","EmailAddress":"frank@microsoft.com","RelevanceScore":857.0,"puser":"rui","ptenant":"MSFT"} diff --git a/src/csharp/Microsoft.Spark.E2ETest/Resources/search_actions.json b/src/csharp/Microsoft.Spark.E2ETest/Resources/search_actions.json new file mode 100644 index 000000000..ffa502ebf --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/Resources/search_actions.json @@ -0,0 +1 @@ +{"SessionId":"AC6B34DE-9168-4D27-89A5-8DBCB9EB2C3C","ImpressionId":"Imp1","ConversationId":"DD8A6B40-B4C9-426F-8194-895E9053077C","RequestTime":"10/12/2018 7:33:14 PM","ClientLocalTime":"10/12/2018 7:33:14 PM","PartnerName":"officeshared","RequestStartTime":"10/12/2018 7:33:14 PM","ScenarioName":"outlookdesktop","RouteName":"query","FlightInfo":"ABCD;PQR;XYZ","HttpStatus":"200","UserType":"Business","UserIdPUID":"0003000080000BF9","Ring":"MSIT","Query":"face","EntityType":"Message","FolderIdList":"F1;F2;F3","ReferenceIdList":"R1;R2;R3","TitleList":"[NonUrgent] [EXO] [PROD]: GriffinV2 MDM Alert - Low success rate in SDFV2.suggestions.GuidedFormulationEmailEntitiesProvider;RESOLVED: [Incident] [EXO] [PROD]: Native Csg is crashing causing Search AppPool restarts;RE: [Incident] [EXO] [PROD]: MDM Notification - High latency in WW.gbrp123.suggestions","ConversationIdList":"434C5A69DE4A504B87650551E90343C7;DDDC67FB24D1094BB012C4F09118BA64;2B1D7646F91FD5488344D491AB3C671C","ItemIdList":"ItemId1;ItemId2;ItemId3","ItemImmutableIdList":"ItemImmutableId1;ItemImmutableId2;ItemImmutableId3","TimeReceivedList":"10/4/2018 4:51:57 PM;9/27/2018 6:53:15 AM;6/7/2018 8:41:02 PM","EventDetails":"{\"Type\":\"ReplyAll\",\"ReferenceId\":\"784d4c42-460e-4022-ad44-0fc48eae529a.1000.1\",\"EventLocalTime\":\"2018-10-12T19:33:15-07:00\"}","ErrorMessage":null} diff --git a/src/csharp/Microsoft.Spark.E2ETest/ScenarioTests/EmailSearchTests.cs b/src/csharp/Microsoft.Spark.E2ETest/ScenarioTests/EmailSearchTests.cs new file mode 100644 index 000000000..78253fa33 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/ScenarioTests/EmailSearchTests.cs @@ -0,0 +1,163 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.Spark.Sql; +using Xunit; +using static Microsoft.Spark.Sql.Functions; +using Column = Microsoft.Spark.Sql.Column; + +namespace Microsoft.Spark.E2ETest.ScenarioTests +{ + [Collection("Spark E2E Tests")] + public class EmailSearchTests + { + private readonly SparkSession _spark; + + public EmailSearchTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + /// + /// This is a mimic of Email Search Top People Reducer. + /// https://msasg.visualstudio.com/DefaultCollection/Shared%20Data/_search?action=contents&text=TopPeopleReducer&type=code&lp=code-Project&filters=ProjectFilters%7BShared%20Data%7DRepositoryFilters%7BMatrixCompliant%7D&pageSize=25&result=DefaultCollection%2FShared%20Data%2FMatrixCompliant%2FGBmaster%2F%2Fsrc%2FODIN-ML%2FPartner%2FEmailRelevance%2FEmailRelevance%2FEmailRelevanceHelper%2FTopPeopleReducer.cs + /// + [Fact] + public void TestEmailSearchTopNReducerBasics() + { + // Read the sample data. + DataFrame df = _spark + .Read() + .Schema("Id STRING, DisplayName STRING, GivenName STRING, Surname STRING, IMAddress STRING, EmailAddress STRING, RelevanceScore DOUBLE, puser STRING, ptenant STRING") + .Json($"{TestEnvironment.ResourceDirectory}neighbors.json"); + + // Trim the IMAddress column. + Func trimIMAddress = Udf((str) => str.StartsWith("sip:") ? str.Substring(4) : str); + df = df.WithColumn("IMAddress", trimIMAddress(df["IMAddress"])); + + // Reduce + df = df.GroupBy("puser", "ptenant").Agg(CollectList("GivenName").Alias("GivenNames"), + CollectList("Surname").Alias("Surnames"), + CollectList("DisplayName").Alias("DisplayNames"), + CollectList("EmailAddress").Alias("EmailAddresses"), + CollectList("RelevanceScore").Alias("RelevanceScores"), + CollectList("IMAddress").Alias("IMAddresses")); + // Format the output. + df = df.Select(df["puser"], + df["ptenant"], + ConcatWs(";", df["GivenNames"]).Alias("GivenNames"), + ConcatWs(";", df["Surnames"]).Alias("Surnames"), + ConcatWs(";", df["DisplayNames"]).Alias("DisplayNames"), + ConcatWs(";", df["EmailAddresses"]).Alias("EmailAddresses"), + ConcatWs(";", df["RelevanceScores"]).Alias("RelevanceScores"), + ConcatWs(";", df["IMAddresses"]).Alias("IMAddresses")); + + Assert.Equal(2, df.Count()); + foreach (Row row in df.Collect()) + { + string puser = row.GetAs("puser"); + Assert.Equal("MSFT", row.GetAs("ptenant")); + Assert.Equal("1101.0;900.0;857.0", row.GetAs("RelevanceScores")); + switch (puser) + { + case "ruih": + Assert.Equal("AliceFN;BobFN;CharlieFN", row.GetAs("GivenNames")); + Assert.Equal("AliceLN;BobLN;CharlieLN", row.GetAs("Surnames")); + Assert.Equal("AliceFN AliceLN;BobFN BobLN;CharlieFN CharlieLN", row.GetAs("DisplayNames")); + Assert.Equal("alice@microsoft.com;bob@microsoft.com;charlie@microsoft.com", row.GetAs("EmailAddresses")); + Assert.Equal("alice@microsoft.com;bob@microsoft.com;charlie@microsoft.com", row.GetAs("IMAddresses")); + break; + case "rui": + Assert.Equal("DougFN;ElvaFN;FrankFN", row.GetAs("GivenNames")); + Assert.Equal("DougLN;ElvaLN;FrankLN", row.GetAs("Surnames")); + Assert.Equal("DougFN DougLN;ElvaFN ElvaLN;FrankFN FrankLN", row.GetAs("DisplayNames")); + Assert.Equal("doug@microsoft.com;elva@microsoft.com;frank@microsoft.com", row.GetAs("EmailAddresses")); + Assert.Equal("doug@microsoft.com;elva@microsoft.com;frank@microsoft.com", row.GetAs("IMAddresses")); + break; + default: + throw new Exception($"Unexpected age: {puser}."); + } + } + } + + /// + /// This is a mimic of Email Search Success Action Reducer. + /// https://msasg.visualstudio.com/DefaultCollection/Shared%20Data/_git/MatrixCompliant?path=%2Fsrc%2FODIN-ML%2FPartner%2FEmailRelevance%2FImpressionView%2FAnalysisTools%2FReducer%2FSearchActionSuccessReducer.cs&_a=contents&version=GBmaster + /// + [Fact] + public void TestEmailSearchSuccessActionReducerBasics() + { + // Read the sample data. + DataFrame df = _spark.Read().Json($"{TestEnvironment.ResourceDirectory}search_actions.json"); + + // Select the required columns. + df = df.Select("ImpressionId", "ConversationId", "EntityType", "FolderIdList", "ReferenceIdList", "ItemIdList", "ItemImmutableIdList"); + + // Convert columns of concatenated string to array of strings. + Func toStringArrayUdf = Udf((str) => str.Split(';')); + df = df.WithColumn("FolderIdList", toStringArrayUdf(df["FolderIdList"])) + .WithColumn("ReferenceIdList", toStringArrayUdf(df["ReferenceIdList"])) + .WithColumn("ItemIdList", toStringArrayUdf(df["ItemIdList"])) + .WithColumn("ItemImmutableIdList", toStringArrayUdf(df["ItemImmutableIdList"])); + + // Apply the ArrayZip function to combine the i-th element of each array. + df = df.Select(df["ConversationId"], df["ImpressionId"], df["EntityType"], ArraysZip(df["FolderIdList"], df["ReferenceIdList"], df["ItemIdList"], df["ItemImmutableIdList"]).Alias("ConcatedColumn")); + + // Apply the Explode function to split into multiple rows. + df = df.Select(df["ConversationId"], df["ImpressionId"], df["EntityType"], Explode(df["ConcatedColumn"]).Alias("NewColumn")); + + // Create multiple columns. + df = df.WithColumn("FolderId", df["NewColumn"].GetField("FolderIdList")) + .WithColumn("ReferenceId", df["NewColumn"].GetField("ReferenceIdList")) + .WithColumn("ItemId", df["NewColumn"].GetField("ItemIdList")) + .WithColumn("ItemImmutableId", df["NewColumn"].GetField("ItemImmutableIdList")) + .Select("ConversationId", "ImpressionId", "EntityType", "FolderId", "ItemId", "ReferenceId", "ItemImmutableId"); + + // Check the results. + Assert.Equal(3, df.Count()); + int i = 0; + foreach (Row row in df.Collect()) + { + string impressionId = row.GetAs("ImpressionId"); + string conversationId = row.GetAs("ConversationId"); + string entityType = row.GetAs("EntityType"); + Assert.Equal("Imp1", impressionId); + Assert.Equal("DD8A6B40-B4C9-426F-8194-895E9053077C", conversationId); + Assert.Equal("Message", entityType); + string folderId = row.GetAs("FolderId"); + string itemId = row.GetAs("ItemId"); + string referenceId = row.GetAs("ReferenceId"); + string itemImmutableId = row.GetAs("ItemImmutableId"); + if (i == 0) + { + Assert.Equal("F1", folderId); + Assert.Equal("ItemId1", itemId); + Assert.Equal("R1", referenceId); + Assert.Equal("ItemImmutableId1", itemImmutableId); + } + else if (i == 1) + { + Assert.Equal("F2", folderId); + Assert.Equal("ItemId2", itemId); + Assert.Equal("R2", referenceId); + Assert.Equal("ItemImmutableId2", itemImmutableId); + } + else if (i == 2) + { + Assert.Equal("F3", folderId); + Assert.Equal("ItemId3", itemId); + Assert.Equal("R3", referenceId); + Assert.Equal("ItemImmutableId3", itemImmutableId); + } + else + { + throw new Exception(string.Format("Unexpected row: ConversationId={0}, ImpressionId={1}", conversationId, impressionId)); + } + + i++; + } + } + } +}