Skip to content

Commit

Permalink
benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
russcam committed May 23, 2024
1 parent 6ed3797 commit 188c722
Show file tree
Hide file tree
Showing 12 changed files with 407 additions and 24 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Manually triggered workflow that runs the BenchmarkDotNet benchmarks selected by
# the `tests` filter and appends the GitHub-flavoured markdown report to the job summary.
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      tests:
        type: string
        required: true
        default: "*English*Detection*"
        description: "filter for tests to run"

jobs:
  report:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Setup .NET
      uses: actions/setup-dotnet@v4
      with:
        dotnet-version: 8.0.x
    - name: Run Benchmarks
      # Pass the user-supplied filter through an environment variable and quote it:
      # interpolating ${{ inputs.tests }} directly into the script lets the shell
      # glob-expand patterns such as *English*Detection* and allows script injection.
      env:
        BENCHMARK_FILTER: ${{ inputs.tests }}
      run: ./build.sh benchmark --filter "$BENCHMARK_FILTER" --exporters GitHub --job Short
    - name: Write Summary
      run: |
        cat BenchmarkDotNet.Artifacts/results/*github.md >> "$GITHUB_STEP_SUMMARY"
3 changes: 2 additions & 1 deletion src/Lingua/LanguageDetector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,9 @@ private void PreloadLanguageModels()
using var gzipStream = new GZipStream(stream, CompressionMode.Decompress);
return LanguageModel.FromJson(gzipStream);
}
catch (Exception ex) when (ex is FileNotFoundException or IOException)
catch (FileNotFoundException)
{
// there may not be a model for a given ngram/language
return new Dictionary<string, double>();
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/Lingua/Lingua.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
</GZip>
<ItemGroup>
<Content Remove="@(Content)" />
<Content Include="@(GZipContent)" CopyToOutputDirectory="PreserveNewest">
<Content Include="@(GZipContent)" CopyToOutputDirectory="Always">
<!-- Package gzipped contents -->
<Pack>true</Pack>
<PackagePath>contentFiles/any/any/%(RelativeDir)%(Filename)%(Extension)</PackagePath>
Expand Down
4 changes: 0 additions & 4 deletions tests/Lingua.AccuracyReport.Tests/SupportedLanguages.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@ static SupportedLanguages()
languagesSupportedByLanguageDetection.Add(Nynorsk);
languagesSupportedByLanguageDetection.Add(Bokmal);
}
else
{
var foo = isoCode;
}
}

var languageDetection = languagesSupportedByLanguageDetection.ToArray();
Expand Down
2 changes: 1 addition & 1 deletion tests/Lingua.Benchmarks/Lingua.Benchmarks.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Nullable>disable</Nullable>
</PropertyGroup>

<ItemGroup>
Expand Down
59 changes: 59 additions & 0 deletions tests/Lingua.Benchmarks/Sentences/EnglishSentencesDetection.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
using BenchmarkDotNet.Attributes;
using NTextCat;

namespace Lingua.Benchmarks.Sentences;

/// <summary>
/// BenchmarkDotNet benchmark that runs one English sentence through four detectors:
/// Lingua with its default configuration, Lingua in low accuracy mode (the baseline),
/// LanguageDetection and NTextCat.
/// </summary>
public class EnglishSentencesDetection
{
    // Manifest resource name of the NTextCat profile loaded in GlobalSetup.
    private const string NTextCatProfileResourceName = "Lingua.Benchmarks.Core14.profile.xml";

    private LanguageDetector _linguaLanguageDetector;
    private LanguageDetector _lowAccuracyLinguaLanguageDetector;
    private LanguageDetection.LanguageDetector _languageDetectionLanguageDetector;
    private RankedLanguageIdentifier _nTextCatLanguageDetector;

    /// <summary>
    /// Creates every detector used by the benchmark methods. Marked as the
    /// BenchmarkDotNet global setup, so it is not part of the measured methods.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The NTextCat profile is not present as a manifest resource of the benchmark assembly.
    /// </exception>
    [GlobalSetup]
    public void GlobalSetup()
    {
        var languages = SupportedLanguages.ByAllImplementations;

        // LanguageDetection is configured with lower-cased ISO 639-3 codes.
        var detector = new LanguageDetection.LanguageDetector();
        detector.AddLanguages(languages.Select(l => l.IsoCode6393().ToString().ToLowerInvariant()).ToArray());
        _languageDetectionLanguageDetector = detector;

        _linguaLanguageDetector = LanguageDetectorBuilder
            .FromLanguages(languages)
            .WithPreloadedLanguageModels()
            .Build();

        _lowAccuracyLinguaLanguageDetector = LanguageDetectorBuilder
            .FromLanguages(languages)
            .WithPreloadedLanguageModels()
            .WithLowAccuracyMode()
            .Build();

        var factory = new RankedLanguageIdentifierFactory();
        using var stream = typeof(Program).Assembly
            .GetManifestResourceStream(NTextCatProfileResourceName);

        // GetManifestResourceStream returns null when the resource does not exist;
        // fail with a descriptive message instead of handing null to the factory.
        if (stream is null)
        {
            throw new InvalidOperationException(
                $"Embedded resource '{NTextCatProfileResourceName}' was not found in the benchmark assembly.");
        }

        _nTextCatLanguageDetector = factory.Load(stream);
    }

    /// <summary>Baseline: Lingua detector built with low accuracy mode.</summary>
    [Benchmark(Baseline = true)]
    public Language LinguaLowAccuracy() => _lowAccuracyLinguaLanguageDetector.DetectLanguageOf(Text);

    /// <summary>Lingua detector built without low accuracy mode.</summary>
    [Benchmark]
    public Language Lingua() => _linguaLanguageDetector.DetectLanguageOf(Text);

    /// <summary>LanguageDetection library detector.</summary>
    [Benchmark]
    public string LanguageDetection() => _languageDetectionLanguageDetector.Detect(Text);

    /// <summary>NTextCat identifier; returns the first entry of the identification result.</summary>
    [Benchmark]
    public Tuple<NTextCat.LanguageInfo, double> NTextCat() => _nTextCatLanguageDetector.Identify(Text).First();

    /// <summary>Input text for the benchmark methods, supplied from <see cref="ValuesForText"/>.</summary>
    [ParamsSource(nameof(ValuesForText))]
    public string Text { get; set; }

    /// <summary>Values used as the <see cref="Text"/> parameter.</summary>
    public IEnumerable<string> ValuesForText => new[]
    {
        "On numerous occasions, slight disturbances have caused computers to malfunction and entire industrial facilities to shutdown."
    };
}
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Attributes;
using NTextCat;

namespace Lingua.Benchmarks;
namespace Lingua.Benchmarks.SingleWord;

public class SingleWordDetection
public class EnglishSingleWordDetection
{
private readonly LanguageDetector _linguaLanguageDetector;
private readonly LanguageDetector _lowAccuracyLinguaLanguageDetector;
private readonly LanguageDetection.LanguageDetector _languageDetectionLanguageDetector;
private readonly RankedLanguageIdentifier _nTextCatLanguageDetector;
private LanguageDetector _linguaLanguageDetector;
private LanguageDetector _lowAccuracyLinguaLanguageDetector;
private LanguageDetection.LanguageDetector _languageDetectionLanguageDetector;
private RankedLanguageIdentifier _nTextCatLanguageDetector;

public SingleWordDetection()
[GlobalSetup]
public void GlobalSetup()
{
var languages = SupportedLanguages.ByAllImplementations;

Expand All @@ -37,19 +37,23 @@ public SingleWordDetection()
_nTextCatLanguageDetector = nTextCatDetector;
}

[Params("ialomiţa", "podĺa", "ґрунтовому", "cằm", "suspiciously")]
// ReSharper disable once UnassignedField.Global
public string? Text;

[Benchmark(Baseline = true)]
public Language LinguaLowAccuracy() => _lowAccuracyLinguaLanguageDetector.DetectLanguageOf(Text!);
public Language LinguaLowAccuracy() => _lowAccuracyLinguaLanguageDetector.DetectLanguageOf(Text);

[Benchmark]
public Language Lingua() => _linguaLanguageDetector.DetectLanguageOf(Text!);
public Language Lingua() => _linguaLanguageDetector.DetectLanguageOf(Text);

[Benchmark]
public string? LanguageDetection() => _languageDetectionLanguageDetector.Detect(Text!);
public string LanguageDetection() => _languageDetectionLanguageDetector.Detect(Text);

[Benchmark]
public Tuple<NTextCat.LanguageInfo, double> NTextCat() => _nTextCatLanguageDetector.Identify(Text!).First();
public Tuple<NTextCat.LanguageInfo, double> NTextCat() => _nTextCatLanguageDetector.Identify(Text).First();

[ParamsSource(nameof(ValuesForText))]
public string Text { get; set; }

public IEnumerable<string> ValuesForText => new[]
{
"suspiciously"
};
}
59 changes: 59 additions & 0 deletions tests/Lingua.Benchmarks/SingleWord/RomanianSingleWordDetection.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
using BenchmarkDotNet.Attributes;
using NTextCat;

namespace Lingua.Benchmarks.SingleWord;

/// <summary>
/// BenchmarkDotNet benchmark that runs a single Romanian word through four detectors:
/// Lingua with its default configuration, Lingua in low accuracy mode (the baseline),
/// LanguageDetection and NTextCat.
/// </summary>
public class RomanianSingleWordDetection
{
    // Manifest resource name of the NTextCat profile loaded in GlobalSetup.
    private const string NTextCatProfileResourceName = "Lingua.Benchmarks.Core14.profile.xml";

    private LanguageDetector _linguaLanguageDetector;
    private LanguageDetector _lowAccuracyLinguaLanguageDetector;
    private LanguageDetection.LanguageDetector _languageDetectionLanguageDetector;
    private RankedLanguageIdentifier _nTextCatLanguageDetector;

    /// <summary>
    /// Creates every detector used by the benchmark methods. Marked as the
    /// BenchmarkDotNet global setup, so it is not part of the measured methods.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The NTextCat profile is not present as a manifest resource of the benchmark assembly.
    /// </exception>
    [GlobalSetup]
    public void GlobalSetup()
    {
        var languages = SupportedLanguages.ByAllImplementations;

        // LanguageDetection is configured with lower-cased ISO 639-3 codes.
        var detector = new LanguageDetection.LanguageDetector();
        detector.AddLanguages(languages.Select(l => l.IsoCode6393().ToString().ToLowerInvariant()).ToArray());
        _languageDetectionLanguageDetector = detector;

        _linguaLanguageDetector = LanguageDetectorBuilder
            .FromLanguages(languages)
            .WithPreloadedLanguageModels()
            .Build();

        _lowAccuracyLinguaLanguageDetector = LanguageDetectorBuilder
            .FromLanguages(languages)
            .WithPreloadedLanguageModels()
            .WithLowAccuracyMode()
            .Build();

        var factory = new RankedLanguageIdentifierFactory();
        using var stream = typeof(Program).Assembly
            .GetManifestResourceStream(NTextCatProfileResourceName);

        // GetManifestResourceStream returns null when the resource does not exist;
        // fail with a descriptive message instead of handing null to the factory.
        if (stream is null)
        {
            throw new InvalidOperationException(
                $"Embedded resource '{NTextCatProfileResourceName}' was not found in the benchmark assembly.");
        }

        _nTextCatLanguageDetector = factory.Load(stream);
    }

    /// <summary>Baseline: Lingua detector built with low accuracy mode.</summary>
    [Benchmark(Baseline = true)]
    public Language LinguaLowAccuracy() => _lowAccuracyLinguaLanguageDetector.DetectLanguageOf(Text);

    /// <summary>Lingua detector built without low accuracy mode.</summary>
    [Benchmark]
    public Language Lingua() => _linguaLanguageDetector.DetectLanguageOf(Text);

    /// <summary>LanguageDetection library detector.</summary>
    [Benchmark]
    public string LanguageDetection() => _languageDetectionLanguageDetector.Detect(Text);

    /// <summary>NTextCat identifier; returns the first entry of the identification result.</summary>
    [Benchmark]
    public Tuple<NTextCat.LanguageInfo, double> NTextCat() => _nTextCatLanguageDetector.Identify(Text).First();

    /// <summary>Input text for the benchmark methods, supplied from <see cref="ValuesForText"/>.</summary>
    [ParamsSource(nameof(ValuesForText))]
    public string Text { get; set; }

    /// <summary>Values used as the <see cref="Text"/> parameter.</summary>
    public IEnumerable<string> ValuesForText => new[]
    {
        "ialomiţa"
    };
}
59 changes: 59 additions & 0 deletions tests/Lingua.Benchmarks/SingleWord/SlovakSingleWordDetection.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
using BenchmarkDotNet.Attributes;
using NTextCat;

namespace Lingua.Benchmarks.SingleWord;

/// <summary>
/// BenchmarkDotNet benchmark that runs a single Slovak word through four detectors:
/// Lingua with its default configuration, Lingua in low accuracy mode (the baseline),
/// LanguageDetection and NTextCat.
/// </summary>
public class SlovakSingleWordDetection
{
    // Manifest resource name of the NTextCat profile loaded in GlobalSetup.
    private const string NTextCatProfileResourceName = "Lingua.Benchmarks.Core14.profile.xml";

    private LanguageDetector _linguaLanguageDetector;
    private LanguageDetector _lowAccuracyLinguaLanguageDetector;
    private LanguageDetection.LanguageDetector _languageDetectionLanguageDetector;
    private RankedLanguageIdentifier _nTextCatLanguageDetector;

    /// <summary>
    /// Creates every detector used by the benchmark methods. Marked as the
    /// BenchmarkDotNet global setup, so it is not part of the measured methods.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The NTextCat profile is not present as a manifest resource of the benchmark assembly.
    /// </exception>
    [GlobalSetup]
    public void GlobalSetup()
    {
        var languages = SupportedLanguages.ByAllImplementations;

        // LanguageDetection is configured with lower-cased ISO 639-3 codes.
        var detector = new LanguageDetection.LanguageDetector();
        detector.AddLanguages(languages.Select(l => l.IsoCode6393().ToString().ToLowerInvariant()).ToArray());
        _languageDetectionLanguageDetector = detector;

        _linguaLanguageDetector = LanguageDetectorBuilder
            .FromLanguages(languages)
            .WithPreloadedLanguageModels()
            .Build();

        _lowAccuracyLinguaLanguageDetector = LanguageDetectorBuilder
            .FromLanguages(languages)
            .WithPreloadedLanguageModels()
            .WithLowAccuracyMode()
            .Build();

        var factory = new RankedLanguageIdentifierFactory();
        using var stream = typeof(Program).Assembly
            .GetManifestResourceStream(NTextCatProfileResourceName);

        // GetManifestResourceStream returns null when the resource does not exist;
        // fail with a descriptive message instead of handing null to the factory.
        if (stream is null)
        {
            throw new InvalidOperationException(
                $"Embedded resource '{NTextCatProfileResourceName}' was not found in the benchmark assembly.");
        }

        _nTextCatLanguageDetector = factory.Load(stream);
    }

    /// <summary>Baseline: Lingua detector built with low accuracy mode.</summary>
    [Benchmark(Baseline = true)]
    public Language LinguaLowAccuracy() => _lowAccuracyLinguaLanguageDetector.DetectLanguageOf(Text);

    /// <summary>Lingua detector built without low accuracy mode.</summary>
    [Benchmark]
    public Language Lingua() => _linguaLanguageDetector.DetectLanguageOf(Text);

    /// <summary>LanguageDetection library detector.</summary>
    [Benchmark]
    public string LanguageDetection() => _languageDetectionLanguageDetector.Detect(Text);

    /// <summary>NTextCat identifier; returns the first entry of the identification result.</summary>
    [Benchmark]
    public Tuple<NTextCat.LanguageInfo, double> NTextCat() => _nTextCatLanguageDetector.Identify(Text).First();

    /// <summary>Input text for the benchmark methods, supplied from <see cref="ValuesForText"/>.</summary>
    [ParamsSource(nameof(ValuesForText))]
    public string Text { get; set; }

    /// <summary>Values used as the <see cref="Text"/> parameter.</summary>
    public IEnumerable<string> ValuesForText => new[]
    {
        "podĺa"
    };
}
59 changes: 59 additions & 0 deletions tests/Lingua.Benchmarks/SingleWord/UkranianSingleWordDetection.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
using BenchmarkDotNet.Attributes;
using NTextCat;

namespace Lingua.Benchmarks.SingleWord;

/// <summary>
/// BenchmarkDotNet benchmark that runs a single Ukrainian word through four detectors:
/// Lingua with its default configuration, Lingua in low accuracy mode (the baseline),
/// LanguageDetection and NTextCat.
/// </summary>
/// <remarks>
/// NOTE(review): the class name misspells "Ukrainian"; it is kept as-is because the
/// name is the public identifier that benchmark filters match against.
/// </remarks>
public class UkranianSingleWordDetection
{
    // Manifest resource name of the NTextCat profile loaded in GlobalSetup.
    private const string NTextCatProfileResourceName = "Lingua.Benchmarks.Core14.profile.xml";

    private LanguageDetector _linguaLanguageDetector;
    private LanguageDetector _lowAccuracyLinguaLanguageDetector;
    private LanguageDetection.LanguageDetector _languageDetectionLanguageDetector;
    private RankedLanguageIdentifier _nTextCatLanguageDetector;

    /// <summary>
    /// Creates every detector used by the benchmark methods. Marked as the
    /// BenchmarkDotNet global setup, so it is not part of the measured methods.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The NTextCat profile is not present as a manifest resource of the benchmark assembly.
    /// </exception>
    [GlobalSetup]
    public void GlobalSetup()
    {
        var languages = SupportedLanguages.ByAllImplementations;

        // LanguageDetection is configured with lower-cased ISO 639-3 codes.
        var detector = new LanguageDetection.LanguageDetector();
        detector.AddLanguages(languages.Select(l => l.IsoCode6393().ToString().ToLowerInvariant()).ToArray());
        _languageDetectionLanguageDetector = detector;

        _linguaLanguageDetector = LanguageDetectorBuilder
            .FromLanguages(languages)
            .WithPreloadedLanguageModels()
            .Build();

        _lowAccuracyLinguaLanguageDetector = LanguageDetectorBuilder
            .FromLanguages(languages)
            .WithPreloadedLanguageModels()
            .WithLowAccuracyMode()
            .Build();

        var factory = new RankedLanguageIdentifierFactory();
        using var stream = typeof(Program).Assembly
            .GetManifestResourceStream(NTextCatProfileResourceName);

        // GetManifestResourceStream returns null when the resource does not exist;
        // fail with a descriptive message instead of handing null to the factory.
        if (stream is null)
        {
            throw new InvalidOperationException(
                $"Embedded resource '{NTextCatProfileResourceName}' was not found in the benchmark assembly.");
        }

        _nTextCatLanguageDetector = factory.Load(stream);
    }

    /// <summary>Baseline: Lingua detector built with low accuracy mode.</summary>
    [Benchmark(Baseline = true)]
    public Language LinguaLowAccuracy() => _lowAccuracyLinguaLanguageDetector.DetectLanguageOf(Text);

    /// <summary>Lingua detector built without low accuracy mode.</summary>
    [Benchmark]
    public Language Lingua() => _linguaLanguageDetector.DetectLanguageOf(Text);

    /// <summary>LanguageDetection library detector.</summary>
    [Benchmark]
    public string LanguageDetection() => _languageDetectionLanguageDetector.Detect(Text);

    /// <summary>NTextCat identifier; returns the first entry of the identification result.</summary>
    [Benchmark]
    public Tuple<NTextCat.LanguageInfo, double> NTextCat() => _nTextCatLanguageDetector.Identify(Text).First();

    /// <summary>Input text for the benchmark methods, supplied from <see cref="ValuesForText"/>.</summary>
    [ParamsSource(nameof(ValuesForText))]
    public string Text { get; set; }

    /// <summary>Values used as the <see cref="Text"/> parameter.</summary>
    public IEnumerable<string> ValuesForText => new[]
    {
        "ґрунтовому"
    };
}
Loading

0 comments on commit 188c722

Please sign in to comment.