Skip to content

Commit

Permalink
handle narrow whitespaces in default text extractor #319
Browse files Browse the repository at this point in the history
Where a gap is small but much larger than all previous gaps at this
font size (and still larger than some minimum threshold), break
the word at this gap boundary.
  • Loading branch information
EliotJones committed May 9, 2021
1 parent 264cf7b commit 97e831c
Showing 1 changed file with 37 additions and 2 deletions.
39 changes: 37 additions & 2 deletions src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)

var lettersSoFar = new List<Letter>(10);

var gapCountsSoFarByFontSize = new Dictionary<double, Dictionary<double, int>>();

var y = default(double?);
var lastX = default(double?);
var lastLetter = default(Letter);
Expand Down Expand Up @@ -68,15 +70,48 @@ public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
continue;
}

var letterHeight = Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height);

var gap = letter.Location.X - (lastLetter.Location.X + lastLetter.Width);
var nextToLeft = letter.Location.X < lastX.Value - 1;
var nextBigSpace = gap > Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height) * 0.39;
var nextBigSpace = gap > letterHeight * 0.39;
var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value);
var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1;
var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1;
var nextTextOrientationDiffers = letter.TextOrientation != lastLetter.TextOrientation;

if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers)
var suspectGap = false;

if (!nextFontSizeDiffers && letter.FontSize > 0 && gap >= 0)
{
var fontSize = Math.Round(letter.FontSize);
if (!gapCountsSoFarByFontSize.TryGetValue(fontSize, out var gapCounts))
{
gapCounts = new Dictionary<double, int>();
gapCountsSoFarByFontSize[fontSize] = gapCounts;
}

var gapRounded = Math.Round(gap, 2);
if (!gapCounts.ContainsKey(gapRounded))
{
gapCounts[gapRounded] = 0;
}

gapCounts[gapRounded]++;

// More than one type of gap.
if (gapCounts.Count > 1 && gap > letterHeight * 0.16)
{
var mostCommonGap = gapCounts.OrderByDescending(x => x.Value).First();

if (gap > (mostCommonGap.Key * 5) && mostCommonGap.Value > 1)
{
suspectGap = true;
}
}
}

if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers || suspectGap)
{
if (lettersSoFar.Count > 0)
{
Expand Down

0 comments on commit 97e831c

Please sign in to comment.