Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 8 additions & 11 deletions src/MarkItDown/Converters/DocxConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,22 +89,19 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
}
}

/// <summary>
/// Reads the body of a Word document from <paramref name="stream"/> and returns its text.
/// </summary>
/// <remarks>
/// The work runs synchronously on the calling thread; the method is not <c>async</c>, so any
/// exception (including cancellation raised further down) is thrown directly to the caller
/// rather than being stored in the returned task.
/// </remarks>
/// <param name="stream">Stream containing the document; opened with <c>isEditable = false</c>.</param>
/// <param name="cancellationToken">Token forwarded to <c>ProcessBodyElements</c>.</param>
/// <returns>An already-completed task holding the accumulated text with outer whitespace trimmed.</returns>
private static Task<string> ExtractTextFromDocxAsync(Stream stream, CancellationToken cancellationToken)
{
    var result = new StringBuilder();

    using var wordDocument = WordprocessingDocument.Open(stream, false);
    var body = wordDocument.MainDocumentPart?.Document?.Body;

    // A package without a main part / document / body simply yields an empty string.
    if (body != null)
    {
        ProcessBodyElements(body, result, cancellationToken);
    }

    return Task.FromResult(result.ToString().Trim());
}

private static void ProcessBodyElements(Body body, StringBuilder result, CancellationToken cancellationToken)
Expand Down
78 changes: 36 additions & 42 deletions src/MarkItDown/Converters/PdfConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -271,33 +271,30 @@ private sealed class PdfPigTextExtractor : IPdfTextExtractor
{
/// <summary>
/// Extracts the text of every page in the PDF and joins the non-blank pages into one string.
/// </summary>
/// <remarks>
/// Runs synchronously on the calling thread and returns an already-completed task. Pages whose
/// text is null or whitespace are skipped. Between two emitted pages a separator line
/// (<c>"\n---\n"</c> followed by a line break) is appended.
/// </remarks>
/// <param name="pdfBytes">Raw bytes of the PDF document.</param>
/// <param name="cancellationToken">Checked once per page before the page is read.</param>
/// <returns>A completed task holding the concatenated page text.</returns>
public Task<string> ExtractTextAsync(byte[] pdfBytes, CancellationToken cancellationToken)
{
    var builder = new StringBuilder();

    using var pdfDocument = PdfDocument.Open(pdfBytes);

    // The loop is 1-based and inclusive of NumberOfPages.
    for (var pageNumber = 1; pageNumber <= pdfDocument.NumberOfPages; pageNumber++)
    {
        cancellationToken.ThrowIfCancellationRequested();
        var page = pdfDocument.GetPage(pageNumber);
        var pageText = page.Text;

        if (string.IsNullOrWhiteSpace(pageText))
        {
            continue;
        }

        // Only emit the separator once something has already been written.
        if (builder.Length > 0)
        {
            builder.AppendLine("\n---\n");
        }

        builder.AppendLine(pageText.Trim());
    }

    return Task.FromResult(builder.ToString());
}
}

Expand All @@ -322,34 +319,31 @@ public Task<IReadOnlyList<string>> RenderImagesAsync(byte[] pdfBytes, Cancellati
[SupportedOSPlatform("ios")]
private static Task<IReadOnlyList<string>> RenderOnSupportedPlatformsAsync(byte[] pdfBytes, CancellationToken cancellationToken)
{
    // Renders the PDF to bitmaps, encodes each as PNG and returns the images as Base64 strings.
    // Runs synchronously on the calling thread; the returned task is already completed.
    var images = new List<string>();
    var options = new RenderOptions
    {
        Dpi = 144,
        WithAnnotations = true,
        WithAspectRatio = true,
        AntiAliasing = PdfAntiAliasing.All,
    };

#pragma warning disable CA1416
    foreach (var bitmap in Conversion.ToImages(pdfBytes, password: null, options))
    {
        // Take ownership of the bitmap before the cancellation check so that it is
        // disposed even when cancellation aborts the loop on this iteration.
        using var bmp = bitmap;
        cancellationToken.ThrowIfCancellationRequested();

        using var data = bmp.Encode(SKEncodedImageFormat.Png, quality: 90);
        if (data is null)
        {
            // Encoding produced nothing for this bitmap; skip it rather than fail the whole render.
            continue;
        }

        images.Add(Convert.ToBase64String(data.Span));
    }
#pragma warning restore CA1416

    return Task.FromResult<IReadOnlyList<string>>(images);
}
}
}
29 changes: 13 additions & 16 deletions src/MarkItDown/Converters/PptxConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,31 +91,28 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
}
}

/// <summary>
/// Walks the slides of a presentation read from <paramref name="stream"/> and returns the
/// content accumulated by <c>ProcessSlide</c>.
/// </summary>
/// <remarks>
/// Runs synchronously on the calling thread and returns an already-completed task. Slides are
/// visited in the order of the presentation's slide id list and numbered starting at 1.
/// </remarks>
/// <param name="stream">Stream containing the presentation; opened with <c>isEditable = false</c>.</param>
/// <param name="cancellationToken">Checked once per slide before the slide is processed.</param>
/// <returns>A completed task holding the accumulated text with outer whitespace trimmed.</returns>
private static Task<string> ExtractContentFromPptxAsync(Stream stream, CancellationToken cancellationToken)
{
    var result = new StringBuilder();

    using var presentationDocument = PresentationDocument.Open(stream, false);
    var presentationPart = presentationDocument.PresentationPart;

    // A package without a presentation part or slide id list yields an empty string.
    if (presentationPart?.Presentation?.SlideIdList != null)
    {
        var slideCount = 0;

        foreach (var slideId in presentationPart.Presentation.SlideIdList.Elements<SlideId>())
        {
            cancellationToken.ThrowIfCancellationRequested();

            slideCount++;
            var slidePart = (SlidePart)presentationPart.GetPartById(slideId.RelationshipId!);
            ProcessSlide(slidePart, slideCount, result);
        }
    }

    return Task.FromResult(result.ToString().Trim());
}

private static void ProcessSlide(SlidePart slidePart, int slideNumber, StringBuilder result)
Expand Down
27 changes: 12 additions & 15 deletions src/MarkItDown/Converters/XlsxConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,28 +89,25 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
}
}

/// <summary>
/// Walks the sheets of a workbook read from <paramref name="stream"/> and returns the content
/// accumulated by <c>ProcessWorksheet</c>.
/// </summary>
/// <remarks>
/// Runs synchronously on the calling thread and returns an already-completed task. A sheet
/// whose name is missing is passed on under the fallback name <c>"Sheet"</c>.
/// </remarks>
/// <param name="stream">Stream containing the workbook; opened with <c>isEditable = false</c>.</param>
/// <param name="cancellationToken">Checked once per sheet before the sheet is processed.</param>
/// <returns>A completed task holding the accumulated text with outer whitespace trimmed.</returns>
private static Task<string> ExtractDataFromXlsxAsync(Stream stream, CancellationToken cancellationToken)
{
    var result = new StringBuilder();

    using var spreadsheetDocument = SpreadsheetDocument.Open(stream, false);
    var workbookPart = spreadsheetDocument.WorkbookPart;

    // A package without a workbook part or sheet list yields an empty string.
    if (workbookPart?.Workbook?.Sheets != null)
    {
        foreach (var sheet in workbookPart.Workbook.Sheets.Elements<Sheet>())
        {
            cancellationToken.ThrowIfCancellationRequested();

            var worksheetPart = (WorksheetPart)workbookPart.GetPartById(sheet.Id!);
            ProcessWorksheet(worksheetPart, sheet.Name?.Value ?? "Sheet", result, workbookPart);
        }
    }

    return Task.FromResult(result.ToString().Trim());
}

private static void ProcessWorksheet(WorksheetPart worksheetPart, string sheetName, StringBuilder result, WorkbookPart workbookPart)
Expand Down