diff --git a/README.md b/README.md
index 01d7bb5ff..9a437c476 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,7 @@ A modern C#/.NET library for converting a wide range of document formats (HTML,
 | **JSON** | `.json`, `.jsonl`, `.ndjson` | ✅ Supported | Structured JSON data with formatting |
 | **XML** | `.xml`, `.xsd`, `.xsl`, `.rss`, `.atom` | ✅ Supported | XML documents with structure preservation |
 | **EPUB** | `.epub` | ✅ Supported | E-book files with metadata and content |
+| **Email** | `.eml` | ✅ Supported | Email files with headers, content, and attachment info |
 | **ZIP** | `.zip` | ✅ Supported | Archive processing with recursive file conversion |
 | **Jupyter Notebook** | `.ipynb` | ✅ Supported | Python notebooks with code and markdown cells |
 | **RSS/Atom Feeds** | `.rss`, `.atom`, `.xml` | ✅ Supported | Web feeds with structured content and metadata |
@@ -199,6 +200,32 @@ var result = await markItDown.ConvertAsync(stream, streamInfo);
 Console.WriteLine(result.Title);
 ```
 
+### Convert email files (EML)
+
+```csharp
+using MarkItDown;
+
+// Convert an EML file to Markdown
+var markItDown = new MarkItDown();
+DocumentConverterResult result = await markItDown.ConvertAsync("message.eml");
+
+// The result includes email headers and content
+Console.WriteLine($"Subject: {result.Title}");
+Console.WriteLine(result.Markdown);
+// Output includes:
+// # Email
+// **Subject:** Important Project Update
+// **From:** sender@example.com
+// **To:** recipient@example.com
+// **Date:** 2024-01-15 10:30:00 +00:00
+//
+// ## Message Content
+// [Email body content converted to Markdown]
+//
+// ## Attachments (if any)
+// - **file.pdf** (application/pdf) - 1.2 MB
+```
+
 ### Convert content from HTTP/HTTPS
 
 ```csharp
diff --git a/src/MarkItDown/Converters/EmlConverter.cs b/src/MarkItDown/Converters/EmlConverter.cs
new file mode 100644
index 000000000..e7068a1e9
--- /dev/null
+++ b/src/MarkItDown/Converters/EmlConverter.cs
@@ -0,0 +1,303 @@
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+using ManagedCode.MimeTypes;
+using MimeKit;
+
+namespace MarkItDown.Converters;
+
+/// <summary>
+/// Converter for EML (email) files that extracts headers, content, and attachment metadata.
+/// </summary>
+public sealed class EmlConverter : IDocumentConverter
+{
+    private const string UnknownSize = "Unknown size";
+
+    private static readonly HashSet<string> AcceptedExtensions = new(StringComparer.OrdinalIgnoreCase)
+    {
+        ".eml"
+    };
+
+    private static readonly string[] AcceptedMimeTypePrefixes =
+    {
+        "message/rfc822",
+        "message/email",
+        "application/email",
+        "text/email"
+    };
+
+    private readonly HtmlConverter _htmlConverter;
+
+    /// <summary>
+    /// Converter priority; lower numbers mean higher priority. 240 sits between PPTX (230) and EPUB (250).
+    /// </summary>
+    public int Priority => 240;
+
+    /// <summary>
+    /// Creates the converter together with the HTML converter used for HTML message bodies.
+    /// </summary>
+    public EmlConverter()
+    {
+        _htmlConverter = new HtmlConverter();
+    }
+
+    /// <summary>
+    /// Returns true when the stream metadata carries an .eml extension or an accepted email MIME type.
+    /// </summary>
+    public bool AcceptsInput(StreamInfo streamInfo)
+    {
+        // The extension set compares case-insensitively, so no lower-casing is needed before the lookup.
+        if (streamInfo.Extension is { } extension && AcceptedExtensions.Contains(extension))
+            return true;
+
+        var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo);
+
+        return MimeTypeUtilities.MatchesAny(normalizedMime, AcceptedMimeTypePrefixes)
+            || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, AcceptedMimeTypePrefixes);
+    }
+
+    /// <summary>
+    /// Returns true when this converter can handle the stream. Only the metadata is inspected:
+    /// parsing the entire message just for detection would be expensive.
+    /// </summary>
+    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
+    {
+        return AcceptsInput(streamInfo);
+    }
+
+    /// <summary>
+    /// Parses the stream as a MIME message and renders its headers, body, and attachment list as Markdown.
+    /// </summary>
+    /// <exception cref="MarkItDownException">The message could not be parsed or converted.</exception>
+    /// <exception cref="OperationCanceledException">The operation was cancelled.</exception>
+    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
+    {
+        try
+        {
+            if (stream.CanSeek)
+                stream.Position = 0;
+
+            using var message = await MimeMessage.LoadAsync(stream, cancellationToken).ConfigureAwait(false);
+
+            var markdown = await ConvertEmailToMarkdownAsync(message, cancellationToken).ConfigureAwait(false);
+            var title = ExtractTitle(message);
+
+            return new DocumentConverterResult(markdown, title);
+        }
+        // Cancellation must reach the caller as-is instead of being reported as a conversion failure.
+        catch (Exception ex) when (ex is not MarkItDownException and not OperationCanceledException)
+        {
+            throw new MarkItDownException($"Failed to convert EML file: {ex.Message}", ex);
+        }
+    }
+
+    /// <summary>
+    /// Builds the Markdown document: header block, message body, and attachment list.
+    /// </summary>
+    private async Task<string> ConvertEmailToMarkdownAsync(MimeMessage message, CancellationToken cancellationToken)
+    {
+        var result = new StringBuilder();
+
+        result.AppendLine("# Email");
+        result.AppendLine();
+
+        // Essential headers
+        if (!string.IsNullOrEmpty(message.Subject))
+            result.AppendLine($"**Subject:** {EscapeMarkdown(message.Subject)}");
+
+        AppendAddressHeader(result, "From", message.From);
+        AppendAddressHeader(result, "To", message.To);
+        AppendAddressHeader(result, "CC", message.Cc);
+
+        if (message.Date != DateTimeOffset.MinValue)
+        {
+            // Invariant culture keeps the output independent of the host's calendar and separators.
+            var date = message.Date.ToString("yyyy-MM-dd HH:mm:ss zzz", CultureInfo.InvariantCulture);
+            result.AppendLine($"**Date:** {date}");
+        }
+
+        // Additional headers if present
+        if (!string.IsNullOrEmpty(message.MessageId))
+            result.AppendLine($"**Message-ID:** {EscapeMarkdown(message.MessageId)}");
+
+        result.AppendLine();
+
+        var bodyContent = await ExtractBodyContentAsync(message, cancellationToken).ConfigureAwait(false);
+        if (!string.IsNullOrEmpty(bodyContent))
+        {
+            result.AppendLine("## Message Content");
+            result.AppendLine();
+            result.AppendLine(bodyContent);
+            result.AppendLine();
+        }
+
+        var attachments = ExtractAttachmentInfo(message);
+        if (attachments.Count > 0)
+        {
+            result.AppendLine("## Attachments");
+            result.AppendLine();
+            foreach (var attachment in attachments)
+            {
+                result.AppendLine($"- **{EscapeMarkdown(attachment.Name)}** ({attachment.ContentType}) - {attachment.Size}");
+            }
+            result.AppendLine();
+        }
+
+        return result.ToString().Trim();
+    }
+
+    /// <summary>
+    /// Appends a "**Label:** a, b" line when the address list is non-empty.
+    /// </summary>
+    private static void AppendAddressHeader(StringBuilder builder, string label, InternetAddressList addresses)
+    {
+        if (addresses is null || addresses.Count == 0)
+            return;
+
+        var formatted = string.Join(", ", addresses.Select(FormatAddress));
+        builder.AppendLine($"**{label}:** {EscapeMarkdown(formatted)}");
+    }
+
+    /// <summary>
+    /// Returns the message body as Markdown: the HTML body converted via the HTML converter when
+    /// available, otherwise the escaped plain-text body.
+    /// </summary>
+    private async Task<string> ExtractBodyContentAsync(MimeMessage message, CancellationToken cancellationToken)
+    {
+        if (message.Body == null)
+            return string.Empty;
+
+        var htmlBody = message.HtmlBody;
+        if (!string.IsNullOrEmpty(htmlBody))
+        {
+            try
+            {
+                using var htmlStream = new MemoryStream(Encoding.UTF8.GetBytes(htmlBody));
+                var htmlStreamInfo = new StreamInfo(mimeType: "text/html");
+                var result = await _htmlConverter.ConvertAsync(htmlStream, htmlStreamInfo, cancellationToken).ConfigureAwait(false);
+                return result.Markdown;
+            }
+            catch (Exception) when (!cancellationToken.IsCancellationRequested)
+            {
+                // Best effort: a failed HTML conversion falls through to the fallbacks below.
+                // The filter lets failures caused by cancellation propagate instead of being swallowed.
+            }
+        }
+
+        // Prefer the real plain-text part; escaped raw HTML is only the last resort.
+        var textBody = message.TextBody;
+        if (!string.IsNullOrEmpty(textBody))
+            return EscapeMarkdown(textBody);
+
+        return string.IsNullOrEmpty(htmlBody) ? string.Empty : EscapeMarkdown(htmlBody);
+    }
+
+    /// <summary>
+    /// Collects name, MIME type, and formatted size for every attachment of the message.
+    /// </summary>
+    private static List<AttachmentInfo> ExtractAttachmentInfo(MimeMessage message)
+    {
+        var attachments = new List<AttachmentInfo>();
+
+        foreach (var attachment in message.Attachments)
+        {
+            var name = attachment.ContentDisposition?.FileName ??
+                       attachment.ContentType?.Name ??
+                       "Unknown";
+
+            // MimeType is the bare "type/subtype"; ContentType.ToString() serializes the whole header.
+            var contentType = attachment.ContentType?.MimeType ?? "application/octet-stream";
+
+            attachments.Add(new AttachmentInfo(name, contentType, DescribeAttachmentSize(attachment)));
+        }
+
+        return attachments;
+    }
+
+    /// <summary>
+    /// Returns a human-readable attachment size: the declared size when present, otherwise the
+    /// measured size of the decoded content, otherwise "Unknown size".
+    /// </summary>
+    private static string DescribeAttachmentSize(MimeEntity attachment)
+    {
+        if (attachment is not MimePart part)
+            return UnknownSize;
+
+        if (part.ContentDisposition?.Size is long declaredSize && declaredSize >= 0)
+            return FileUtilities.FormatFileSize(declaredSize);
+
+        if (long.TryParse(part.Headers["Content-Length"], NumberStyles.Integer, CultureInfo.InvariantCulture, out var contentLength)
+            && contentLength >= 0)
+        {
+            return FileUtilities.FormatFileSize(contentLength);
+        }
+
+        if (part.Content is null)
+            return UnknownSize;
+
+        try
+        {
+            // Most messages declare no size, so count the decoded bytes without buffering them.
+            using var measure = new MimeKit.IO.MeasuringStream();
+            part.Content.DecodeTo(measure);
+            return FileUtilities.FormatFileSize(measure.Length);
+        }
+        catch (Exception ex) when (ex is IOException or FormatException or ObjectDisposedException)
+        {
+            return UnknownSize;
+        }
+    }
+
+    /// <summary>
+    /// Formats a mailbox as "Name &lt;address&gt;", or just the address when no display name is set.
+    /// </summary>
+    private static string FormatAddress(InternetAddress address)
+    {
+        return address switch
+        {
+            MailboxAddress mailbox when !string.IsNullOrEmpty(mailbox.Name) =>
+                $"{mailbox.Name} <{mailbox.Address}>",
+            MailboxAddress mailbox => mailbox.Address,
+            _ => address.ToString()
+        };
+    }
+
+    /// <summary>
+    /// Picks the document title: the subject, else "Email from {sender}", else "Email Message".
+    /// </summary>
+    private static string ExtractTitle(MimeMessage message)
+    {
+        // A whitespace-only subject would otherwise yield an empty title after trimming.
+        if (!string.IsNullOrWhiteSpace(message.Subject))
+            return message.Subject.Trim();
+
+        var sender = message.From?.FirstOrDefault();
+        if (sender != null)
+            return $"Email from {FormatAddress(sender)}";
+
+        return "Email Message";
+    }
+
+    /// <summary>
+    /// Escapes the Markdown characters most likely to break formatting (backslash, backtick,
+    /// asterisk, underscore). Angle brackets and parentheses are left alone to keep addresses readable.
+    /// </summary>
+    private static string EscapeMarkdown(string text)
+    {
+        if (string.IsNullOrEmpty(text))
+            return string.Empty;
+
+        return text
+            .Replace("\\", "\\\\") // Escape backslashes first
+            .Replace("`", "\\`")   // Escape backticks
+            .Replace("*", "\\*")   // Escape asterisks
+            .Replace("_", "\\_");  // Escape underscores
+    }
+
+    private sealed record AttachmentInfo(string Name, string ContentType, string Size);
+}
\ No newline at end of file
diff --git a/src/MarkItDown/Converters/ZipConverter.cs b/src/MarkItDown/Converters/ZipConverter.cs
index b23cc3657..6666f0adc 100644
--- a/src/MarkItDown/Converters/ZipConverter.cs
+++ b/src/MarkItDown/Converters/ZipConverter.cs
@@ -151,7 +151,7 @@ private async Task ProcessZipEntry(ZipArchiveEntry entry, StringBuilder markdown
         // Add basic file information
         if (entry.Length > 0)
         {
-            markdown.AppendLine($"**Size:** {FormatFileSize(entry.Length)}");
+            markdown.AppendLine($"**Size:** {FileUtilities.FormatFileSize(entry.Length)}");
         }
 
         if (entry.LastWriteTime != DateTimeOffset.MinValue)
@@ -173,7 +173,7 @@ private async Task ProcessZipEntry(ZipArchiveEntry entry, StringBuilder markdown
         const long maxFileSize = 50 * 1024 * 1024; // 50MB
         if (entry.Length > maxFileSize)
         {
-            markdown.AppendLine($"*File too large to process ({FormatFileSize(entry.Length)})*");
+            markdown.AppendLine($"*File too large to process ({FileUtilities.FormatFileSize(entry.Length)})*");
             markdown.AppendLine();
             return;
         }
@@ -253,19 +253,4 @@ private async Task ProcessZipEntry(ZipArchiveEntry entry, StringBuilder markdown
 
         return null;
     }
-
-    private static string FormatFileSize(long bytes)
-    {
-        string[] sizes = { "B", "KB", "MB", "GB" };
-        double len = bytes;
-        int order = 0;
-
-        while (len >= 1024 && order < sizes.Length - 1)
-        {
-            order++;
-            len /= 1024;
-        }
-
-        return $"{len:0.##} {sizes[order]}";
-    }
 }
diff --git a/src/MarkItDown/FileUtilities.cs b/src/MarkItDown/FileUtilities.cs
new file mode 100644
index 000000000..2a1c56e28
--- /dev/null
+++ b/src/MarkItDown/FileUtilities.cs
@@ -0,0 +1,27 @@
+namespace MarkItDown;
+
+/// <summary>
+/// Utility class for common file operations and formatting.
+/// </summary>
+internal static class FileUtilities
+{
+    /// <summary>
+    /// Formats a file size in bytes to a human-readable string with appropriate units.
+    /// </summary>
+    /// <param name="bytes">The size in bytes.</param>
+    /// <returns>A formatted string with the size and appropriate unit (B, KB, MB, GB).</returns>
+ public static string FormatFileSize(long bytes) + { + string[] sizes = { "B", "KB", "MB", "GB" }; + double len = bytes; + int order = 0; + + while (len >= 1024 && order < sizes.Length - 1) + { + order++; + len /= 1024; + } + + return $"{len:0.##} {sizes[order]}"; + } +} \ No newline at end of file diff --git a/src/MarkItDown/MarkItDown.cs b/src/MarkItDown/MarkItDown.cs index 0d5bd66e3..1706fc00c 100644 --- a/src/MarkItDown/MarkItDown.cs +++ b/src/MarkItDown/MarkItDown.cs @@ -255,6 +255,7 @@ private IEnumerable CreateBuiltInConverters() new JupyterNotebookConverter(), new CsvConverter(), new EpubConverter(), + new EmlConverter(), new XmlConverter(), new ZipConverter(CreateZipInnerConverters(CreateImageConverter, CreateAudioConverter)), new PdfConverter(), @@ -281,6 +282,7 @@ private IEnumerable CreateZipInnerConverters(Func + + + diff --git a/src/MarkItDown/MimeMapping.cs b/src/MarkItDown/MimeMapping.cs index 9274bafb2..67b613dbb 100644 --- a/src/MarkItDown/MimeMapping.cs +++ b/src/MarkItDown/MimeMapping.cs @@ -39,6 +39,7 @@ internal static class MimeMapping [".m4a"] = "audio/mp4", [".mp4"] = "video/mp4", [".msg"] = "application/vnd.ms-outlook", + [".eml"] = "message/rfc822", }; private static readonly Dictionary MimeToExtension = ExtensionToMime diff --git a/tests/MarkItDown.Tests/EmlConverterTests.cs b/tests/MarkItDown.Tests/EmlConverterTests.cs new file mode 100644 index 000000000..133cf0865 --- /dev/null +++ b/tests/MarkItDown.Tests/EmlConverterTests.cs @@ -0,0 +1,159 @@ +using System; +using System.IO; +using System.Text; +using System.Threading.Tasks; +using MarkItDown; +using MarkItDown.Converters; +using Shouldly; + +namespace MarkItDown.Tests; + +public class EmlConverterTests +{ + private const string SampleEmail = @"Return-Path: +Received: from mail.example.com (mail.example.com [192.168.1.1]) + by recipient.example.com with SMTP; Mon, 15 Jan 2024 10:30:00 +0000 +Date: Mon, 15 Jan 2024 10:30:00 +0000 +From: John Doe +To: Jane Smith +Cc: Team +Subject: 
Important Project Update +Message-ID: <123456789@example.com> +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 + +Hello Jane, + +I wanted to update you on the current project status: + +1. Phase 1 is complete +2. Phase 2 is in progress +3. Phase 3 starts next week + +Key points: +- Budget is on track +- Timeline looks good +- Team morale is high + +Please let me know if you have any questions. + +Best regards, +John"; + + [Fact] + public async Task ConvertAsync_ValidEmlContent_ReturnsCorrectMarkdown() + { + // Arrange + var converter = new EmlConverter(); + var bytes = Encoding.UTF8.GetBytes(SampleEmail); + using var stream = new MemoryStream(bytes); + var streamInfo = new StreamInfo(mimeType: "message/rfc822", extension: ".eml"); + + // Act + var result = await converter.ConvertAsync(stream, streamInfo); + + // Assert + result.ShouldNotBeNull(); + result.Markdown.ShouldNotBeNullOrWhiteSpace(); + result.Title.ShouldBe("Important Project Update"); + + // Check essential email headers are present + result.Markdown.ShouldContain("**Subject:** Important Project Update"); + result.Markdown.ShouldContain("**From:** John Doe "); + result.Markdown.ShouldContain("**To:** Jane Smith "); + result.Markdown.ShouldContain("**CC:** Team "); + result.Markdown.ShouldContain("**Date:** 2024-01-15 10:30:00 +00:00"); + + // Check message content is included + result.Markdown.ShouldContain("Hello Jane"); + result.Markdown.ShouldContain("Phase 1 is complete"); + result.Markdown.ShouldContain("Best regards"); + result.Markdown.ShouldContain("John"); + } + + [Fact] + public async Task ConvertAsync_EmailWithoutSubject_UsesFromAsFallbackTitle() + { + // Arrange + var emailWithoutSubject = @"Date: Mon, 15 Jan 2024 10:30:00 +0000 +From: John Doe +To: Jane Smith +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 + +This is a simple message without a subject."; + + var converter = new EmlConverter(); + var bytes = Encoding.UTF8.GetBytes(emailWithoutSubject); + using var 
stream = new MemoryStream(bytes); + var streamInfo = new StreamInfo(mimeType: "message/rfc822"); + + // Act + var result = await converter.ConvertAsync(stream, streamInfo); + + // Assert + result.ShouldNotBeNull(); + result.Title.ShouldBe("Email from John Doe "); + } + + [Fact] + public async Task ConvertAsync_EmailWithHtmlContent_ConvertsHtmlToMarkdown() + { + // Arrange + var htmlEmail = @"Date: Mon, 15 Jan 2024 10:30:00 +0000 +From: sender@example.com +To: recipient@example.com +Subject: HTML Test +MIME-Version: 1.0 +Content-Type: text/html; charset=UTF-8 + + + +

<h1>Welcome</h1>
+<p>This is <strong>bold</strong> text and <em>italic</em> text.</p>
+<ul>
+<li>Item 1</li>
+<li>Item 2</li>
+</ul>
+ +"; + + var converter = new EmlConverter(); + var bytes = Encoding.UTF8.GetBytes(htmlEmail); + using var stream = new MemoryStream(bytes); + var streamInfo = new StreamInfo(mimeType: "message/rfc822"); + + // Act + var result = await converter.ConvertAsync(stream, streamInfo); + + // Assert + result.ShouldNotBeNull(); + result.Markdown.ShouldNotBeNullOrWhiteSpace(); + result.Title.ShouldBe("HTML Test"); + + // Check that HTML was converted to Markdown + result.Markdown.ShouldContain("# Welcome"); + result.Markdown.ShouldContain("**bold**"); + result.Markdown.ShouldContain("*italic*"); + } + + [Fact] + public async Task MarkItDown_ConvertAsync_EmlFile_WorksEndToEnd() + { + // Arrange + var markItDown = new global::MarkItDown.MarkItDown(); + var bytes = Encoding.UTF8.GetBytes(SampleEmail); + using var stream = new MemoryStream(bytes); + var streamInfo = new StreamInfo(mimeType: "message/rfc822", extension: ".eml"); + + // Act + var result = await markItDown.ConvertAsync(stream, streamInfo); + + // Assert + result.ShouldNotBeNull(); + result.Markdown.ShouldNotBeNullOrWhiteSpace(); + result.Title.ShouldBe("Important Project Update"); + result.Markdown.ShouldContain("**Subject:** Important Project Update"); + result.Markdown.ShouldContain("Hello Jane"); + } +} \ No newline at end of file diff --git a/tests/MarkItDown.Tests/NewConvertersTests.cs b/tests/MarkItDown.Tests/NewConvertersTests.cs index 302811770..8aa7f9b20 100644 --- a/tests/MarkItDown.Tests/NewConvertersTests.cs +++ b/tests/MarkItDown.Tests/NewConvertersTests.cs @@ -147,6 +147,7 @@ public void AllNewConverters_HaveCorrectPriorities() [InlineData(".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation")] [InlineData(".jpg", "image/jpeg")] [InlineData(".png", "image/png")] + [InlineData(".eml", "message/rfc822")] public void MarkItDown_RegistersNewConverters_CanHandleNewFormats(string extension, string mimeType) { // Arrange @@ -160,4 +161,64 @@ public void 
MarkItDown_RegistersNewConverters_CanHandleNewFormats(string extensi // Assert canHandle.ShouldBeTrue($"Should have a converter that can handle {extension} files"); } + + [Fact] + public void EmlConverter_AcceptsInput_ValidEmlExtension_ReturnsTrue() + { + // Arrange + var converter = new EmlConverter(); + var streamInfo = new StreamInfo(mimeType: "message/rfc822", extension: ".eml"); + + // Act + var result = converter.AcceptsInput(streamInfo); + + // Assert + result.ShouldBeTrue(); + } + + [Fact] + public void EmlConverter_AcceptsInput_InvalidExtension_ReturnsFalse() + { + // Arrange + var converter = new EmlConverter(); + var streamInfo = new StreamInfo(mimeType: "text/plain", extension: ".txt"); + + // Act + var result = converter.AcceptsInput(streamInfo); + + // Assert + result.ShouldBeFalse(); + } + + [Theory] + [InlineData(".eml", "message/rfc822")] + [InlineData(".eml", "message/email")] + [InlineData(".eml", "application/email")] + [InlineData(".eml", "text/email")] + public void EmlConverter_AcceptsInput_ValidMimeTypes_ReturnsTrue(string extension, string mimeType) + { + // Arrange + var converter = new EmlConverter(); + var streamInfo = new StreamInfo(mimeType: mimeType, extension: extension); + + // Act + var result = converter.AcceptsInput(streamInfo); + + // Assert + result.ShouldBeTrue($"Should accept {extension} files with MIME type {mimeType}"); + } + + [Fact] + public void EmlConverter_Priority_IsBetweenPptxAndEpub() + { + // Arrange + var emlConverter = new EmlConverter(); + var epubConverter = new EpubConverter(); + var pptxConverter = new PptxConverter(); + + // Act & Assert + // Lower number = higher priority, so EML (240) should be between PPTX (230) and EPUB (250) + emlConverter.Priority.ShouldBeGreaterThan(pptxConverter.Priority); + emlConverter.Priority.ShouldBeLessThan(epubConverter.Priority); + } }