From 97e831c321f2ff8d37fe9eb480fb86396e052188 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Sun, 9 May 2021 12:59:53 -0400 Subject: [PATCH] handle narrow whitespaces in default text extractor #319. Where the gap is small but much larger than all previous gaps at this font size (and still larger than some minimum threshold), break the word at this gap boundary. --- .../Util/DefaultWordExtractor.cs | 39 ++++++++++++++++++- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs b/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs index fcc316b8..be663e3b 100644 --- a/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs +++ b/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs @@ -21,6 +21,8 @@ public IEnumerable GetWords(IReadOnlyList letters) var lettersSoFar = new List(10); + var gapCountsSoFarByFontSize = new Dictionary>(); + var y = default(double?); var lastX = default(double?); var lastLetter = default(Letter); @@ -68,15 +70,48 @@ public IEnumerable GetWords(IReadOnlyList letters) continue; } + var letterHeight = Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height); + var gap = letter.Location.X - (lastLetter.Location.X + lastLetter.Width); var nextToLeft = letter.Location.X < lastX.Value - 1; - var nextBigSpace = gap > Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height) * 0.39; + var nextBigSpace = gap > letterHeight * 0.39; var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value); var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1; var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1; var nextTextOrientationDiffers = letter.TextOrientation != lastLetter.TextOrientation; - if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers) + var suspectGap = false; + + if (!nextFontSizeDiffers 
&& letter.FontSize > 0 && gap >= 0) + { + var fontSize = Math.Round(letter.FontSize); + if (!gapCountsSoFarByFontSize.TryGetValue(fontSize, out var gapCounts)) + { + gapCounts = new Dictionary(); + gapCountsSoFarByFontSize[fontSize] = gapCounts; + } + + var gapRounded = Math.Round(gap, 2); + if (!gapCounts.ContainsKey(gapRounded)) + { + gapCounts[gapRounded] = 0; + } + + gapCounts[gapRounded]++; + + // More than one type of gap. + if (gapCounts.Count > 1 && gap > letterHeight * 0.16) + { + var mostCommonGap = gapCounts.OrderByDescending(x => x.Value).First(); + + if (gap > (mostCommonGap.Key * 5) && mostCommonGap.Value > 1) + { + suspectGap = true; + } + } + } + + if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers || suspectGap) { if (lettersSoFar.Count > 0) {