Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,25 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v5
with:
submodules: recursive

- name: Setup .NET
uses: actions/setup-dotnet@v4
with:
dotnet-version: ${{ env.DOTNET_VERSION }}

- name: Restore dependencies
run: dotnet restore Presidio.sln
run: dotnet restore Presidio.slnx

- name: Verify formatting
run: dotnet format Presidio.sln --verify-no-changes
- name: Format code
run: dotnet format Presidio.slnx

- name: Build
run: dotnet build Presidio.sln --configuration Release --no-restore
run: dotnet build Presidio.slnx --configuration Release --no-restore

- name: Test
run: dotnet test Presidio.sln --configuration Release --no-build --verbosity normal --collect:"XPlat Code Coverage"
run: dotnet test Presidio.slnx --configuration Release --no-build --verbosity normal --collect:"XPlat Code Coverage"

- name: Upload coverage reports to Codecov
if: always()
Expand Down
13 changes: 9 additions & 4 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v5
with:
submodules: recursive

- name: Setup .NET
uses: actions/setup-dotnet@v4
Expand All @@ -33,16 +35,16 @@ jobs:
echo "Version from Directory.Build.props: $VERSION"

- name: Restore dependencies
run: dotnet restore Presidio.sln
run: dotnet restore Presidio.slnx

- name: Build
run: dotnet build Presidio.sln --configuration Release --no-restore
run: dotnet build Presidio.slnx --configuration Release --no-restore

- name: Test
run: dotnet test Presidio.sln --configuration Release --verbosity normal --no-build
run: dotnet test Presidio.slnx --configuration Release --verbosity normal --no-build

- name: Pack NuGet packages
run: dotnet pack Presidio.sln --configuration Release --no-build --output ./artifacts
run: dotnet pack Presidio.slnx --configuration Release --no-build --output ./artifacts

- name: Upload artifacts
uses: actions/upload-artifact@v4
Expand All @@ -64,6 +66,8 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v5
with:
submodules: recursive

- name: Download artifacts
uses: actions/download-artifact@v5
Expand Down Expand Up @@ -124,6 +128,7 @@ jobs:
uses: actions/checkout@v5
with:
fetch-depth: 0
submodules: recursive
token: ${{ secrets.GITHUB_TOKEN }}

- name: Download artifacts
Expand Down
9 changes: 8 additions & 1 deletion src/ManagedCode.Presidio.Analyzer/AnalyzerEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,14 @@ public IReadOnlyCollection<RecognizerResult> Analyze(
entities = GetSupportedEntities(language);
}

nlpArtifacts ??= _nlpEngine.ProcessText(text, language);
try
{
nlpArtifacts ??= _nlpEngine.ProcessText(text, language);
}
catch (NotSupportedException ex) when (ex.Message.Contains("Language", StringComparison.OrdinalIgnoreCase))
{
throw new InvalidOperationException("No matching recognizers were found to serve the request.", ex);
}
if (_logDecisionProcess)
{
_appTracer.Trace(correlationId, $"nlp_artifacts: {nlpArtifacts.ToJson()}");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,19 +170,29 @@ private static HashSet<string> BuildRegistrationPrefixes()
private static bool CheckVehicleRegistration(string value)
{
var sanitized = value.ToUpperInvariant();
if (sanitized.Length < 8)
if (sanitized.Length < 4)
{
return false;
}

if (sanitized.Length <= 7)
{
if (IsLegacyRegistration(sanitized))
{
return true;
}

return MatchesDiplomaticRegistration(sanitized);
}

var prefix = sanitized[..2];
if (!TWO_FACTOR_REGISTRATION_PREFIX.Contains(prefix))
{
return false;
return MatchesDiplomaticRegistration(sanitized);
}

var districtCode = string.Empty;
if (sanitized.Length > 2 && char.IsDigit(sanitized[2]))
if (char.IsDigit(sanitized[2]))
{
if (sanitized.Length > 3 && char.IsDigit(sanitized[3]))
{
Expand All @@ -194,11 +204,6 @@ private static bool CheckVehicleRegistration(string value)
}
}

if (sanitized.Length < 4)
{
return false;
}

var registrationDigits = sanitized[^4..];
if (!registrationDigits.All(char.IsDigit))
{
Expand All @@ -210,24 +215,74 @@ private static bool CheckVehicleRegistration(string value)
return false;
}

if (!string.IsNullOrEmpty(districtCode) && STATE_RTO_DISTRICT_SETS.TryGetValue(prefix, out var districts) && districts.Contains(districtCode))
if (!string.IsNullOrEmpty(districtCode))
{
if (STATE_RTO_DISTRICT_SETS.TryGetValue(prefix, out var districts) && districts.Contains(districtCode))
{
return true;
}

if (int.TryParse(districtCode, out var districtNumber) && districtNumber is >= 1 and <= 99)
{
return true;
}
}

return MatchesDiplomaticRegistration(sanitized);
}

/// <summary>
/// Checks whether <paramref name="sanitized"/> has one of the short
/// "letters followed by digits" shapes accepted for 5- to 7-character values.
/// </summary>
/// <param name="sanitized">
/// Candidate registration text; the visible caller passes the upper-cased value.
/// </param>
/// <returns>
/// <c>true</c> when the value is one letter plus four digits (length 5),
/// one letter plus five digits or two letters plus four digits (length 6),
/// or three letters plus four digits (length 7); otherwise <c>false</c>.
/// </returns>
private static bool IsLegacyRegistration(string sanitized)
{
    // True when the first `letterCount` characters are all letters and every
    // remaining character is a digit. Only called with letterCount < Length.
    bool HasShape(int letterCount) =>
        sanitized[..letterCount].All(char.IsLetter) && sanitized[letterCount..].All(char.IsDigit);

    // The total length decides which letter/digit splits are acceptable.
    return sanitized.Length switch
    {
        5 => HasShape(1),
        6 => HasShape(1) || HasShape(2),
        7 => HasShape(3),
        _ => false,
    };
}

private static bool MatchesDiplomaticRegistration(string sanitized)
{
foreach (var diplomaticCode in IN_VEHICLE_DIPLOMATIC_CODES)
{
var index = sanitized.IndexOf(diplomaticCode, StringComparison.Ordinal);
if (index >= 0)
if (index < 0)
{
continue;
}

var vehiclePrefix = sanitized[..index];
if (vehiclePrefix.Length == 0 || !vehiclePrefix.All(char.IsDigit))
{
continue;
}

if (!int.TryParse(vehiclePrefix, out var numericPrefix))
{
continue;
}

if ((numericPrefix >= 1 && numericPrefix <= 80) || IN_VEHICLE_FOREIGN_MISSION_CODES_SET.Contains(numericPrefix))
{
var vehiclePrefix = sanitized[..index];
if (vehiclePrefix.Length > 0 && vehiclePrefix.All(char.IsDigit) && int.TryParse(vehiclePrefix, out var numericPrefix))
{
if ((numericPrefix >= 1 && numericPrefix <= 80) || IN_VEHICLE_FOREIGN_MISSION_CODES_SET.Contains(numericPrefix))
{
return true;
}
}
return true;
}
}

Expand Down
32 changes: 16 additions & 16 deletions src/ManagedCode.Presidio.Analyzer/NlpEngineConfiguration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ namespace ManagedCode.Presidio.Analyzer;
/// </summary>
public sealed class NlpEngineConfiguration
{
[YamlMember(Alias = "nlp_engine_name")]
[YamlMember(Alias = "nlp_engine_name", ApplyNamingConventions = false)]
public string EngineName { get; init; } = string.Empty;

[YamlMember(Alias = "models")]
public IReadOnlyList<NlpModelConfiguration> Models { get; init; } = Array.Empty<NlpModelConfiguration>();
[YamlMember(Alias = "models", ApplyNamingConventions = false)]
public List<NlpModelConfiguration> Models { get; init; } = new();

[YamlMember(Alias = "ner_model_configuration")]
[YamlMember(Alias = "ner_model_configuration", ApplyNamingConventions = false)]
public NerModelConfiguration? NerModelConfiguration { get; init; }

[YamlIgnore]
Expand All @@ -25,7 +25,7 @@ public NlpEngineConfiguration WithBaseDirectory(string? baseDirectory) =>
new()
{
EngineName = EngineName,
Models = Models,
Models = new List<NlpModelConfiguration>(Models),
NerModelConfiguration = NerModelConfiguration,
BaseDirectory = baseDirectory,
};
Expand All @@ -36,25 +36,25 @@ public NlpEngineConfiguration WithBaseDirectory(string? baseDirectory) =>
/// </summary>
public sealed class NlpModelConfiguration
{
[YamlMember(Alias = "lang_code")]
[YamlMember(Alias = "lang_code", ApplyNamingConventions = false)]
public string LanguageCode { get; init; } = string.Empty;

[YamlMember(Alias = "model_name")]
[YamlMember(Alias = "model_name", ApplyNamingConventions = false)]
public string? ModelName { get; init; }

[YamlMember(Alias = "model_path")]
[YamlMember(Alias = "model_path", ApplyNamingConventions = false)]
public string? ModelPath { get; init; }

[YamlMember(Alias = "vocabulary_path")]
[YamlMember(Alias = "vocabulary_path", ApplyNamingConventions = false)]
public string? VocabularyPath { get; init; }

[YamlMember(Alias = "configuration_path")]
[YamlMember(Alias = "configuration_path", ApplyNamingConventions = false)]
public string? ConfigurationPath { get; init; }

[YamlMember(Alias = "metadata")]
[YamlMember(Alias = "metadata", ApplyNamingConventions = false)]
public IReadOnlyDictionary<string, string>? Metadata { get; init; }

[YamlMember(Alias = "max_sequence_length")]
[YamlMember(Alias = "max_sequence_length", ApplyNamingConventions = false)]
public int? MaxSequenceLength { get; init; }
}

Expand All @@ -63,17 +63,17 @@ public sealed class NlpModelConfiguration
/// </summary>
public sealed class NerModelConfiguration
{
[YamlMember(Alias = "model_to_presidio_entity_mapping")]
[YamlMember(Alias = "model_to_presidio_entity_mapping", ApplyNamingConventions = false)]
public IReadOnlyDictionary<string, string> ModelToPresidioEntityMapping { get; init; } =
new ReadOnlyDictionary<string, string>(new Dictionary<string, string>());

[YamlMember(Alias = "low_confidence_score_multiplier")]
[YamlMember(Alias = "low_confidence_score_multiplier", ApplyNamingConventions = false)]
public double? LowConfidenceScoreMultiplier { get; init; }

[YamlMember(Alias = "low_score_entity_names")]
[YamlMember(Alias = "low_score_entity_names", ApplyNamingConventions = false)]
public IReadOnlyList<string> LowScoreEntityNames { get; init; } = Array.Empty<string>();

[YamlMember(Alias = "labels_to_ignore")]
[YamlMember(Alias = "labels_to_ignore", ApplyNamingConventions = false)]
public IReadOnlyList<string> LabelsToIgnore { get; init; } = Array.Empty<string>();
}

Expand Down
9 changes: 1 addition & 8 deletions src/ManagedCode.Presidio.Analyzer/RecognizerRegistry.cs
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,7 @@ public IReadOnlyCollection<EntityRecognizer> GetRecognizers(
}

var filtered = allFields
? candidates.Where(recognizer =>
string.Equals(recognizer.SupportedLanguage, language, StringComparison.Ordinal))
? candidates
: candidates.Where(recognizer =>
string.Equals(recognizer.SupportedLanguage, language, StringComparison.Ordinal)
&& recognizer.SupportedEntities.Intersect(entities ?? Array.Empty<string>(), StringComparer.Ordinal).Any());
Expand Down Expand Up @@ -129,12 +128,6 @@ public void LoadPredefinedRecognizers(

foreach (var languageConfiguration in definition.Languages)
{
if (languages is { Count: > 0 } &&
!languages.Contains(languageConfiguration.Language, StringComparer.OrdinalIgnoreCase))
{
continue;
}

if (HasRecognizer(recognizerType, languageConfiguration.Language))
{
continue;
Expand Down
6 changes: 3 additions & 3 deletions src/ManagedCode.Presidio.Analyzer/UkNinoRecognizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ public sealed class UkNinoRecognizer(
private static readonly Pattern[] DefaultPatterns =
{
new(
"NINO (medium)",
@"\b(?!bg|gb|nk|kn|nt|tn|zz|BG|GB|NK|KN|NT|TN|ZZ) ?([a-ceghj-pr-tw-zA-CEGHJ-PR-TW-Z]{1}[a-ceghj-npr-tw-zA-CEGHJ-NPR-TW-Z]{1}) ?([0-9]{2}) ?([0-9]{2}) ?([0-9]{2}) ?([a-dA-D{1}])\b",
0.5),
"NINO (high)",
@"\b(?!bg|gb|nk|kn|nt|tn|zz|BG|GB|NK|KN|NT|TN|ZZ) ?([a-ceghj-pr-tw-zA-CEGHJ-PR-TW-Z]{1}[a-ceghj-npr-tw-zA-CEGHJ-NPR-TW-Z]{1}) ?([0-9]{2}) ?([0-9]{2}) ?([0-9]{2}) ?([a-dA-D]{1})\b",
1.0),
};

private static readonly string[] DefaultContext =
Expand Down
12 changes: 10 additions & 2 deletions src/ManagedCode.Presidio.Analyzer/conf/default_recognizers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ recognizers:
- en
type: predefined

- name: AbaRoutingRecognizer
supported_languages:
- en
type: predefined

- name: NhsRecognizer
supported_languages:
- en
Expand All @@ -53,6 +58,11 @@ recognizers:
- en
type: predefined

- name: SgUenRecognizer
supported_languages:
- en
type: predefined

- name: AuAbnRecognizer
supported_languages:
- en
Expand Down Expand Up @@ -92,7 +102,6 @@ recognizers:
supported_languages:
- en
type: predefined
enabled: false

- name: EsNifRecognizer
supported_languages:
Expand Down Expand Up @@ -139,7 +148,6 @@ recognizers:
- ko
- kr
type: predefined
enabled: false

- name: ThTninRecognizer
supported_languages:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,11 @@ public void RecognizerMatchesPythonExpectations(string text, int expectedCount,
}
}

public static TheoryData<string, int, (int, int)[], (double, double)[]> GetCases()
public static IEnumerable<object[]> GetCases()
{
var data = new TheoryData<string, int, (int, int)[], (double, double)[]>();
foreach (var (text, count, positions, scores) in Cases)
{
data.Add(text, count, positions, scores);
yield return new object[] { text, count, positions, scores };
}

return data;
}
}
Loading
Loading