Skip to content

Commit 7ec5cf6

Browse files
authored
Merge pull request #23 from managedcode/codex/add-support-for-missing-pandoc-formats
embed odt fixture inline
2 parents 8ec74c2 + 94126f8 commit 7ec5cf6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+3696
-152
lines changed

README.md

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ dotnet add package ManagedCode.MarkItDown
199199
using MarkItDown;
200200

201201
// Create converter instance
202-
var markItDown = new MarkItDown();
202+
var markItDown = new MarkItDownClient();
203203

204204
// Convert any file to Markdown
205205
var result = await markItDown.ConvertAsync("document.pdf");
@@ -218,7 +218,7 @@ using Microsoft.Extensions.Logging;
218218
// Set up logging to track conversion progress
219219
using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole());
220220
var logger = loggerFactory.CreateLogger<MarkItDownClient>();
221-
var markItDown = new MarkItDown(logger: logger);
221+
var markItDown = new MarkItDownClient(logger: logger);
222222

223223
// Convert documents for vector database ingestion
224224
string[] documents = { "report.pdf", "data.xlsx", "webpage.html" };
@@ -245,7 +245,7 @@ foreach (var doc in documents)
245245
```csharp
246246
using MarkItDown;
247247

248-
var markItDown = new MarkItDown();
248+
var markItDown = new MarkItDownClient();
249249
var emailFolder = @"C:\Emails\Exports";
250250
var outputFolder = @"C:\ProcessedEmails";
251251

@@ -271,7 +271,7 @@ using Microsoft.Extensions.Logging;
271271
using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole());
272272
using var httpClient = new HttpClient();
273273

274-
var markItDown = new MarkItDown(
274+
var markItDown = new MarkItDownClient(
275275
logger: loggerFactory.CreateLogger<MarkItDownClient>(),
276276
httpClient: httpClient);
277277

@@ -310,7 +310,7 @@ foreach (var url in urls)
310310
using MarkItDown;
311311

312312
// Convert a DOCX file and print the Markdown
313-
var markItDown = new MarkItDown();
313+
var markItDown = new MarkItDownClient();
314314
DocumentConverterResult result = await markItDown.ConvertAsync("report.docx");
315315
Console.WriteLine(result.Markdown);
316316
```
@@ -329,7 +329,7 @@ var streamInfo = new StreamInfo(
329329
charset: Encoding.UTF8,
330330
fileName: "invoice.html");
331331

332-
var markItDown = new MarkItDown();
332+
var markItDown = new MarkItDownClient();
333333
var result = await markItDown.ConvertAsync(stream, streamInfo);
334334
Console.WriteLine(result.Title);
335335
```
@@ -340,7 +340,7 @@ Console.WriteLine(result.Title);
340340
using MarkItDown;
341341

342342
// Convert an EML file to Markdown
343-
var markItDown = new MarkItDown();
343+
var markItDown = new MarkItDownClient();
344344
DocumentConverterResult result = await markItDown.ConvertAsync("message.eml");
345345

346346
// The result includes email headers and content
@@ -369,7 +369,7 @@ using Microsoft.Extensions.Logging;
369369
using var loggerFactory = LoggerFactory.Create(static builder => builder.AddConsole());
370370
using var httpClient = new HttpClient();
371371

372-
var markItDown = new MarkItDown(
372+
var markItDown = new MarkItDownClient(
373373
logger: loggerFactory.CreateLogger<MarkItDownClient>(),
374374
httpClient: httpClient);
375375

@@ -462,7 +462,7 @@ var options = new MarkItDownOptions
462462
}
463463
};
464464

465-
var markItDown = new MarkItDown(options);
465+
var markItDown = new MarkItDownClient(options);
466466

467467
// Segments are still available programmatically even when annotations are disabled.
468468
```
@@ -494,7 +494,7 @@ public sealed class MyCustomConverter : IDocumentConverter
494494
}
495495
}
496496

497-
var markItDown = new MarkItDown();
497+
var markItDown = new MarkItDownClient();
498498
markItDown.RegisterConverter(new MyCustomConverter());
499499
```
500500

@@ -546,7 +546,7 @@ public class DocumentProcessor
546546
public DocumentProcessor(ILogger<DocumentProcessor> logger)
547547
{
548548
_logger = logger;
549-
_markItDown = new MarkItDown(logger: logger);
549+
_markItDown = new MarkItDownClient(logger: logger);
550550
}
551551

552552
public async Task<List<ProcessedDocument>> ProcessDirectoryAsync(
@@ -611,7 +611,7 @@ public class DocumentIndexer
611611
public DocumentIndexer(IVectorStore vectorStore)
612612
{
613613
_vectorStore = vectorStore;
614-
_markItDown = new MarkItDown();
614+
_markItDown = new MarkItDownClient();
615615
}
616616

617617
public async Task IndexDocumentAsync<T>(string filePath) where T : class
@@ -690,7 +690,7 @@ public class DocumentConversionFunction
690690
public DocumentConversionFunction(ILogger<DocumentConversionFunction> logger)
691691
{
692692
_logger = logger;
693-
_markItDown = new MarkItDown(logger: logger);
693+
_markItDown = new MarkItDownClient(logger: logger);
694694
}
695695

696696
[Function("ConvertDocument")]
@@ -812,6 +812,8 @@ MarkItDown exposes optional abstractions for running documents through cloud ser
812812

813813
The `AzureIntelligenceOptions`, `GoogleIntelligenceOptions`, and `AwsIntelligenceOptions` helpers wire the respective cloud Document AI/Vision/Speech stacks without forcing the dependency on consumers. You can still bring your own implementation by assigning the provider interfaces directly on `MarkItDownOptions`.
814814

815+
`MarkItDownClient` emits structured `ILogger` events and OpenTelemetry spans by default. Toggle instrumentation with `MarkItDownOptions.EnableTelemetry`, supply a custom `ActivitySource`/`Meter`, or provide a `LoggerFactory` to integrate with your application's logging pipeline.
816+
815817
#### Azure AI setup (keys and managed identity)
816818

817819
- **Docs**: [Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/), [Computer Vision Image Analysis](https://learn.microsoft.com/azure/ai-services/computer-vision/overview-image-analysis), [Video Indexer authentication](https://learn.microsoft.com/azure/azure-video-indexer/video-indexer-get-started/connect-to-azure).
@@ -943,7 +945,7 @@ For LLM-style post-processing, assign `MarkItDownOptions.AiModels` with an `IAiM
943945
```csharp
944946
using MarkItDown;
945947

946-
var markItDown = new MarkItDown();
948+
var markItDown = new MarkItDownClient();
947949

948950
try
949951
{
@@ -1010,7 +1012,7 @@ using var httpClient = new HttpClient();
10101012
httpClient.Timeout = TimeSpan.FromSeconds(30);
10111013
httpClient.DefaultRequestHeaders.Add("User-Agent", "MarkItDown/1.0");
10121014

1013-
var markItDown = new MarkItDown(httpClient: httpClient);
1015+
var markItDown = new MarkItDownClient(httpClient: httpClient);
10141016
```
10151017

10161018
**Logging for Diagnostics:**
@@ -1021,7 +1023,7 @@ using var loggerFactory = LoggerFactory.Create(builder =>
10211023
builder.AddConsole().SetMinimumLevel(LogLevel.Debug));
10221024

10231025
var logger = loggerFactory.CreateLogger<MarkItDownClient>();
1024-
var markItDown = new MarkItDown(logger: logger);
1026+
var markItDown = new MarkItDownClient(logger: logger);
10251027

10261028
// Now you'll see detailed conversion progress in console output
10271029
```
@@ -1034,7 +1036,7 @@ If you're familiar with the original Python library, here are the key difference
10341036

10351037
| Python | C#/.NET | Notes |
10361038
|---------|---------|--------|
1037-
| `MarkItDown()` | `new MarkItDown()` | Similar constructor |
1039+
| `MarkItDown()` | `new MarkItDownClient()` | Similar constructor (the .NET class is named `MarkItDownClient`) |
10381040
| `markitdown.convert("file.pdf")` | `await markItDown.ConvertAsync("file.pdf")` | Async pattern |
10391041
| `markitdown.convert(stream, file_extension=".pdf")` | `await markItDown.ConvertAsync(stream, streamInfo)` | StreamInfo object |
10401042
| `markitdown.convert_url("https://...")` | `await markItDown.ConvertFromUrlAsync("https://...")` | Async URL conversion |
@@ -1046,15 +1048,15 @@ If you're familiar with the original Python library, here are the key difference
10461048
```python
10471049
# Python version
10481050
import markitdown
1049-
md = markitdown.MarkItDown()
1051+
md = markitdown.MarkItDown()
10501052
result = md.convert("document.pdf")
10511053
print(result.text_content)
10521054
```
10531055

10541056
```csharp
10551057
// C# version
10561058
using MarkItDown;
1057-
var markItDown = new MarkItDown();
1059+
var markItDown = new MarkItDownClient();
10581060
var result = await markItDown.ConvertAsync("document.pdf");
10591061
Console.WriteLine(result.Markdown);
10601062
```
@@ -1170,7 +1172,7 @@ Performance will vary based on your specific documents and environment. For prod
11701172

11711173
```csharp
11721174
// 1. Reuse MarkItDownClient instances (they're thread-safe)
1173-
var markItDown = new MarkItDown();
1175+
var markItDown = new MarkItDownClient();
11741176
await Task.WhenAll(
11751177
markItDown.ConvertAsync("file1.pdf"),
11761178
markItDown.ConvertAsync("file2.docx"),
@@ -1183,7 +1185,7 @@ var result = await markItDown.ConvertAsync("large-file.pdf", cancellationToken:
11831185

11841186
// 3. Configure HttpClient for web content (reuse connections)
11851187
using var httpClient = new HttpClient();
1186-
var markItDown = new MarkItDown(httpClient: httpClient);
1188+
var markItDown = new MarkItDownClient(httpClient: httpClient);
11871189

11881190
// 4. Pre-specify StreamInfo to skip format detection
11891191
var streamInfo = new StreamInfo(mimeType: "application/pdf", extension: ".pdf");
@@ -1202,7 +1204,7 @@ var options = new MarkItDownOptions
12021204
ExifToolPath = "/usr/local/bin/exiftool" // Path to exiftool binary (optional)
12031205
};
12041206

1205-
var markItDown = new MarkItDown(options);
1207+
var markItDown = new MarkItDownClient(options);
12061208
```
12071209

12081210
### Advanced AI Integration
@@ -1233,7 +1235,7 @@ var options = new MarkItDownOptions
12331235
}
12341236
};
12351237

1236-
var markItDown = new MarkItDown(options);
1238+
var markItDown = new MarkItDownClient(options);
12371239
```
12381240

12391241
### Conversion Middleware & Raw Artifacts
@@ -1250,7 +1252,7 @@ var options = new MarkItDownOptions
12501252
}
12511253
};
12521254

1253-
var markItDown = new MarkItDown(options);
1255+
var markItDown = new MarkItDownClient(options);
12541256
var result = await markItDown.ConvertAsync("docs/diagram.docx");
12551257

12561258
foreach (var image in result.Artifacts.Images)
@@ -1294,7 +1296,7 @@ var options = new MarkItDownOptions
12941296
}
12951297
};
12961298

1297-
var markItDown = new MarkItDown(options, logger, httpClientFactory.CreateClient());
1299+
var markItDown = new MarkItDownClient(options, logger, httpClientFactory.CreateClient());
12981300
```
12991301

13001302
## 📄 License

docs/MetaMD.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# MetaMD (MMD)
2+
3+
MetaMD is a Markdown profile that layers structured metadata and citation-aware rendering on top of CommonMark. Files typically use the `.metamd` extension (optionally `.metamd.md`) and begin with a JSON front matter block delimited by `+++` fences.
4+
5+
## Front Matter Schema
6+
7+
```json
8+
{
9+
"title": "Document title",
10+
"abstract": "Optional abstract text.",
11+
"contributors": ["Name", "Name"],
12+
"affiliations": ["Organisation"],
13+
"keywords": ["term", "term"],
14+
"references": [
15+
{
16+
"id": "unique-id",
17+
"title": "Reference title",
18+
"authors": ["Author"],
19+
"url": "https://example.com/reference"
20+
}
21+
]
22+
}
23+
```
24+
25+
All properties are optional. Unknown properties are ignored by the converter.
26+
27+
## Reference Syntax
28+
29+
Inline citations use `[@id]`. During conversion each citation is replaced with a Markdown link if a URL is present, or bold text when the reference has no URL. Referenced entries are collected and emitted in a `## References` section at the end of the document, preserving author lists and links.
30+
31+
## Diagram Blocks
32+
33+
MetaMD supports lightweight diagram embedding via custom blocks:
34+
35+
```
36+
:::diagram type="mermaid"
37+
<diagram body>
38+
:::
39+
```
40+
41+
The converter rewrites these blocks as fenced code blocks using the requested diagram type (e.g., `mermaid`, `dot`, `plantuml`).
42+
43+
## Compatibility
44+
45+
Because MetaMD is a superset of Markdown, downstream tools that do not recognise the front matter or diagram directives still render the body content. The .NET converter automatically recognises `.metamd` and `.metamd.md` files, extracts metadata into headings, and normalises references for consistent Markdown output.
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Text.RegularExpressions;
7+
using System.Threading;
8+
using System.Threading.Tasks;
9+
using ManagedCode.MimeTypes;
10+
11+
namespace MarkItDown.Converters;
12+
13+
/// <summary>
/// Converter for AsciiDoc documents.
/// </summary>
/// <remarks>
/// The conversion is a line-oriented, best-effort translation. Section titles, single-line
/// list items, listing/literal delimited blocks and constrained bold/italic emphasis are
/// rewritten as Markdown; every other line is emitted unchanged.
/// </remarks>
public sealed class AsciiDocConverter : IDocumentConverter
{
    /// <summary>Markdown fence emitted in place of AsciiDoc listing/literal block delimiters.</summary>
    private const string CodeFence = "```";

    /// <summary>Minimum number of repeated characters that forms an AsciiDoc block delimiter line.</summary>
    private const int MinDelimiterLength = 4;

    /// <summary>Spaces of indentation added per nesting level of a list item.</summary>
    private const int ListIndentWidth = 4;

    private static readonly IReadOnlyCollection<string> Extensions = new[]
    {
        ".adoc",
        ".asciidoc",
    };

    private static readonly IReadOnlyCollection<string> MimeTypes = new[]
    {
        MimeHelper.GetMimeType(".adoc") ?? MimeTypeUtilities.Compose("text", "asciidoc"),
        MimeHelper.GetMimeType(".asciidoc") ?? MimeTypeUtilities.Compose("text", "asciidoc"),
    };

    // Constrained emphasis only: the markers must not touch a word character (or another marker)
    // on the outside, nor whitespace on the inside. This keeps snake_case identifiers and
    // arithmetic such as "2*3*4" intact and leaves "**text**" (already valid Markdown) alone.
    private static readonly Regex Bold = new(
        @"(?<![\w*])\*(?<text>[^\s*](?:[^*]*[^\s*])?)\*(?![\w*])",
        RegexOptions.Compiled);

    private static readonly Regex Italic = new(
        @"(?<!\w)_(?<text>[^\s_](?:[^_]*[^\s_])?)_(?!\w)",
        RegexOptions.Compiled);

    /// <inheritdoc cref="IDocumentConverter" />
    public int Priority => 160;

    /// <summary>
    /// Determines whether the stream metadata identifies an AsciiDoc document, either by
    /// file extension (<c>.adoc</c>, <c>.asciidoc</c>) or by MIME type.
    /// </summary>
    /// <param name="streamInfo">Metadata describing the input.</param>
    /// <returns><see langword="true"/> when the extension or MIME type matches; otherwise <see langword="false"/>.</returns>
    public bool AcceptsInput(StreamInfo streamInfo)
    {
        var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo);
        var extension = streamInfo.Extension?.ToLowerInvariant();
        if (extension is not null && Extensions.Contains(extension))
        {
            return true;
        }

        return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes)
            || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes);
    }

    /// <summary>
    /// Determines whether this converter accepts the input. Only <paramref name="streamInfo"/>
    /// is inspected; the stream content is not read.
    /// </summary>
    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo);

    /// <summary>
    /// Reads the whole stream as text and converts it from AsciiDoc to Markdown.
    /// </summary>
    /// <param name="stream">Input stream; it is left open after reading.</param>
    /// <param name="streamInfo">Metadata describing the input; its file name is passed to the result.</param>
    /// <param name="cancellationToken">Token observed while reading the stream.</param>
    /// <returns>The conversion result containing the generated Markdown.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="streamInfo"/> is <see langword="null"/>.</exception>
    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(stream);
        ArgumentNullException.ThrowIfNull(streamInfo);

        // UTF-8 is the fallback; a byte-order mark, when present, overrides it.
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true);
        var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
        var markdown = ConvertToMarkdown(content);
        return new DocumentConverterResult(markdown, streamInfo.FileName);
    }

    /// <summary>
    /// Translates AsciiDoc text to Markdown one line at a time.
    /// </summary>
    /// <param name="adoc">The AsciiDoc source text.</param>
    /// <returns>The Markdown text with leading and trailing whitespace removed.</returns>
    private static string ConvertToMarkdown(string adoc)
    {
        var lines = adoc.Replace("\r\n", "\n").Replace('\r', '\n').Split('\n');
        var builder = new StringBuilder();

        // Delimiter line that opened the current listing/literal block; empty when outside one.
        var openDelimiter = string.Empty;

        foreach (var line in lines)
        {
            var trimmed = line.TrimEnd();

            if (openDelimiter.Length > 0)
            {
                // Inside a listing/literal block: copy verbatim until the identical delimiter closes it.
                if (string.Equals(trimmed, openDelimiter, StringComparison.Ordinal))
                {
                    builder.AppendLine(CodeFence);
                    openDelimiter = string.Empty;
                }
                else
                {
                    builder.AppendLine(trimmed);
                }

                continue;
            }

            if (trimmed.Length == 0)
            {
                builder.AppendLine();
                continue;
            }

            if (IsDelimiter(trimmed, '-') || IsDelimiter(trimmed, '.'))
            {
                openDelimiter = trimmed;
                builder.AppendLine(CodeFence);
                continue;
            }

            if (IsDelimiter(trimmed, '='))
            {
                // Example-block delimiter: the enclosed lines are ordinary content, so only the
                // marker itself is dropped (a bare "====" would read as a setext underline in Markdown).
                builder.AppendLine();
                continue;
            }

            if (TryParseHeading(trimmed, out var level, out var title))
            {
                builder.Append('#', level).Append(' ').AppendLine(ConvertInline(title));
                continue;
            }

            if (TryParseListItem(trimmed, out var depth, out var ordered, out var itemText))
            {
                builder
                    .Append(' ', ListIndentWidth * (depth - 1))
                    .Append(ordered ? "1. " : "- ")
                    .AppendLine(ConvertInline(itemText));
                continue;
            }

            builder.AppendLine(ConvertInline(trimmed));
        }

        if (openDelimiter.Length > 0)
        {
            // Unterminated block: close the fence so the Markdown stays well-formed.
            builder.AppendLine(CodeFence);
        }

        return builder.ToString().Trim();
    }

    /// <summary>
    /// Returns whether <paramref name="line"/> consists solely of at least
    /// <see cref="MinDelimiterLength"/> repetitions of <paramref name="marker"/>.
    /// </summary>
    private static bool IsDelimiter(string line, char marker) =>
        line.Length >= MinDelimiterLength && line.TrimStart(marker).Length == 0;

    /// <summary>
    /// Recognises a section title: a run of <c>=</c> followed by whitespace and the title text.
    /// </summary>
    /// <param name="line">A non-empty line without trailing whitespace.</param>
    /// <param name="level">Markdown heading level (the run length clamped to 1-6).</param>
    /// <param name="title">The title text without the marker.</param>
    private static bool TryParseHeading(string line, out int level, out string title)
    {
        level = 0;
        title = string.Empty;

        var run = 0;
        while (run < line.Length && line[run] == '=')
        {
            run++;
        }

        // The marker must be followed by whitespace; since the line has no trailing
        // whitespace, that also guarantees a non-empty title.
        if (run == 0 || run >= line.Length || !char.IsWhiteSpace(line[run]))
        {
            return false;
        }

        level = Math.Clamp(run, 1, 6);
        title = line[run..].Trim();
        return true;
    }

    /// <summary>
    /// Recognises a list item: a run of <c>*</c> (unordered) or <c>.</c> (ordered), or a single
    /// <c>-</c> (unordered), followed by whitespace and the item text.
    /// </summary>
    /// <param name="line">A non-empty line without trailing whitespace.</param>
    /// <param name="depth">Nesting depth, taken from the marker run length (1 = top level).</param>
    /// <param name="ordered"><see langword="true"/> for <c>.</c> markers.</param>
    /// <param name="text">The item text without the marker.</param>
    private static bool TryParseListItem(string line, out int depth, out bool ordered, out string text)
    {
        depth = 0;
        ordered = false;
        text = string.Empty;

        var marker = line[0];
        if (marker is not ('*' or '-' or '.'))
        {
            return false;
        }

        var run = 1;
        while (run < line.Length && line[run] == marker)
        {
            run++;
        }

        // Requiring whitespace after the marker keeps "*bold* text", ".Block title",
        // "-5" and "..." from being mistaken for list items.
        if (run >= line.Length || !char.IsWhiteSpace(line[run]))
        {
            return false;
        }

        // Hyphen markers do not nest; "--" and longer runs are not list markers.
        if (marker == '-' && run > 1)
        {
            return false;
        }

        depth = run;
        ordered = marker == '.';
        text = line[run..].Trim();
        return true;
    }

    /// <summary>
    /// Rewrites constrained AsciiDoc emphasis as Markdown: <c>*bold*</c> becomes <c>**bold**</c>
    /// and <c>_italic_</c> becomes <c>*italic*</c>. Backtick monospace is identical in both
    /// syntaxes and needs no rewriting.
    /// </summary>
    private static string ConvertInline(string text)
    {
        // Bold must run first: the italic pass emits single asterisks that the bold
        // pattern would otherwise pick up.
        var converted = Bold.Replace(text, "**${text}**");
        return Italic.Replace(converted, "*${text}*");
    }
}

0 commit comments

Comments
 (0)