Skip to content

Commit 117be31

Browse files
committed
cli: expose metadata summary
1 parent 9838d09 commit 117be31

File tree

14 files changed

+246
-45
lines changed

14 files changed

+246
-45
lines changed

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ If I tell you to remember something, you do the same, update
3131
- Image placeholders must emit Markdown image links (`![alt](file.png)`) that reference persisted artifacts; only fall back to bold text when no file is available.
3232
- If AI image enrichment yields no insight, log and continue instead of throwing—treat empty payloads as a soft failure.
3333
- When executing tests, always include the `ManualConversionDebugTests` suite; treat its failures as blocking.
34+
- Always run the full test suite after making changes and share the results with the user.
3435
- Telemetry work: instrument both overall document processing time and per-page duration with real metrics alongside traces—include histogram/counter coverage so latency is observable at both levels.
3536
- For large converters, structure them as partial classes and split related files into a dedicated subfolder.
3637
- Markdown hygiene: strip non-breaking, zero-width, or other non-printable spaces; replace them with regular ASCII spaces so output never contains invisible characters like the long space before `Add`.

Directory.Packages.props

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,4 @@
4141
<PackageVersion Include="xunit" Version="2.9.3" />
4242
<PackageVersion Include="xunit.runner.visualstudio" Version="3.1.4" />
4343
</ItemGroup>
44-
</Project>
44+
</Project>

README.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,30 @@ Console.WriteLine(result.Markdown.Length);
266266
- `DocumentConverterResult` exposes `Markdown`, `Title`, `Segments`, `Artifacts`, and `Metadata` for downstream processing.
267267
- Apply custom behaviour through `MarkItDownOptions` (segment settings, AI providers, middleware) when constructing the client.
268268

269+
### Metadata Keys
270+
271+
The `MetadataKeys` static class centralises every metadata field the converters emit so you never have to guess string names. Use these constants when inspecting `DocumentConverterResult.Metadata`, per-segment metadata, or artifact metadata:
272+
273+
```csharp
274+
await using var client = new MarkItDownClient();
275+
var result = await client.ConvertAsync(path);
276+
277+
if (result.Metadata.TryGetValue(MetadataKeys.DocumentTitle, out var title))
278+
{
279+
Console.WriteLine($"Detected title: {title}");
280+
}
281+
282+
foreach (var table in result.Artifacts.Tables)
283+
{
284+
if (table.Metadata.TryGetValue(MetadataKeys.TableComment, out var comment))
285+
{
286+
Console.WriteLine(comment);
287+
}
288+
}
289+
```
290+
291+
Notable keys include `MetadataKeys.TableComment` (table span hints), `MetadataKeys.EmailAttachments` (EML attachment summary), `MetadataKeys.NotebookCellsCount` (Jupyter statistics), and `MetadataKeys.ArchiveEntry` (ZIP entry provenance). Refer to `src/MarkItDown/Utilities/MetadataKeys.cs` for the full catalog; new format handlers add their metadata there so downstream consumers can rely on stable identifiers.
292+
269293
### CLI
270294

271295
Prefer a guided experience? Run the bundled CLI to batch files or URLs:
@@ -276,6 +300,18 @@ dotnet run --project src/MarkItDown.Cli -- path/to/input
276300

277301
Use `dotnet publish` with your preferred runtime identifier if you need a self-contained binary.
278302

303+
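Each run now surfaces the document title plus quick stats (pages, images, tables, attachments) in the conversion summary. Page and attachment counts are read from the `MetadataKeys.DocumentPages` and `MetadataKeys.EmailAttachmentsCount` metadata entries, while image and table counts come from the result's artifacts, so the CLI mirrors what you see when processing results programmatically.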
304+
305+
#### Cloud Provider Configuration Prompts
306+
307+
Choose **Configure cloud providers** in the CLI to register AI integrations without writing code. The prompts map directly to the corresponding option objects:
308+
309+
- **Azure** → `AzureIntelligenceOptions` (`DocumentIntelligence`, `Vision`, `Media`); supports endpoints, API keys/tokens, and Video Indexer account metadata.
310+
- **Google** → `GoogleIntelligenceOptions` with credentials for Vertex AI or Speech services.
311+
- **AWS** → `AwsIntelligenceOptions` for Rekognition/Transcribe style integrations.
312+
313+
You can leave a prompt blank to keep the current value, or enter `-` to clear it. The saved settings are applied to every subsequent conversion until you change them or use **Clear all**. Combine these prompts with the metadata counts above to validate that enrichment providers are wired up correctly.
314+
279315
## 🏗️ Architecture
280316

281317
### Core Components

src/MarkItDown.Cli/ConversionResult.cs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,18 @@
22

33
namespace MarkItDown.Cli;
44

5-
internal sealed record ConversionResult(string Input, string? Output, bool Success, string? Error, int SegmentCount);
5+
/// <summary>
/// Outcome of converting a single CLI input, together with the headline
/// statistics shown in the conversion summary.
/// </summary>
/// <param name="Input">The file path or URL that was converted.</param>
/// <param name="Output">Path of the written Markdown file, or <c>null</c> when the conversion failed.</param>
/// <param name="Success"><c>true</c> when the conversion completed; <c>false</c> when it threw.</param>
/// <param name="Error">Failure message, or <c>null</c> on success.</param>
/// <param name="SegmentCount">Number of segments in the converted document (0 on failure).</param>
/// <param name="Title">Resolved document title, or <c>null</c> when none is available.</param>
/// <param name="PageCount">Page count reported by the converter (0 when unknown).</param>
/// <param name="ImageCount">Number of image artifacts (0 when unknown).</param>
/// <param name="TableCount">Number of table artifacts (0 when unknown).</param>
/// <param name="AttachmentCount">Attachment count reported by the converter (0 when unknown).</param>
/// <param name="AttachmentSummary">Attachment listing reported by the converter, or <c>null</c> when absent.</param>
internal sealed record ConversionResult(
    string Input,
    string? Output,
    bool Success,
    string? Error,
    int SegmentCount,
    string? Title,
    int PageCount,
    int ImageCount,
    int TableCount,
    int AttachmentCount,
    string? AttachmentSummary);
617

718
internal sealed class ConversionSummary
819
{
@@ -16,6 +27,14 @@ public ConversionSummary(IReadOnlyList<ConversionResult> results)
1627
public int SuccessCount => Results.Count(r => r.Success);
1728

1829
public int FailureCount => Results.Count - SuccessCount;
30+
31+
public int TotalPages => Results.Sum(r => r.PageCount);
32+
33+
public int TotalImages => Results.Sum(r => r.ImageCount);
34+
35+
public int TotalTables => Results.Sum(r => r.TableCount);
36+
37+
public int TotalAttachments => Results.Sum(r => r.AttachmentCount);
1938
}
2039

2140
internal readonly record struct ConversionProgress(int Processed, int Total, string Current);

src/MarkItDown.Cli/ConversionService.cs

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System;
22
using System.Collections.Generic;
3+
using System.Globalization;
34
using System.IO;
45
using System.Linq;
56
using System.Text;
@@ -30,13 +31,14 @@ public async Task<ConversionSummary> ConvertFilesAsync(IReadOnlyList<string>? fi
3031

3132
try
3233
{
33-
var conversion = await markItDown.ConvertAsync(file, cancellationToken).ConfigureAwait(false);
34-
var outputPath = await WriteMarkdownAsync(conversion.Markdown, file, outputDirectory, cancellationToken).ConfigureAwait(false);
35-
results.Add(new ConversionResult(file, outputPath, true, null, conversion.Segments.Count));
34+
await using var conversion = await markItDown.ConvertAsync(file, cancellationToken).ConfigureAwait(false);
35+
var markdown = conversion.Markdown;
36+
var outputPath = await WriteMarkdownAsync(markdown, file, outputDirectory, cancellationToken).ConfigureAwait(false);
37+
results.Add(CreateSuccessResult(conversion, file, outputPath));
3638
}
3739
catch (Exception ex)
3840
{
39-
results.Add(new ConversionResult(file, null, false, ex.Message, 0));
41+
results.Add(new ConversionResult(file, null, false, ex.Message, 0, null, 0, 0, 0, 0, null));
4042
}
4143

4244
progress?.Report(new ConversionProgress(index + 1, files.Count, file));
@@ -54,9 +56,11 @@ public async Task<ConversionSummary> ConvertUrlAsync(string url, string outputDi
5456

5557
Directory.CreateDirectory(outputDirectory);
5658
var markItDown = new MarkItDownClient(options);
57-
var conversion = await markItDown.ConvertFromUrlAsync(url, cancellationToken: cancellationToken).ConfigureAwait(false);
58-
var outputPath = await WriteMarkdownAsync(conversion.Markdown, DeriveFileNameFromUrl(url, conversion.Title), outputDirectory, cancellationToken).ConfigureAwait(false);
59-
var result = new ConversionResult(url, outputPath, true, null, conversion.Segments.Count);
59+
await using var conversion = await markItDown.ConvertFromUrlAsync(url, cancellationToken: cancellationToken).ConfigureAwait(false);
60+
var markdown = conversion.Markdown;
61+
var fileName = DeriveFileNameFromUrl(url, ResolveTitle(conversion));
62+
var outputPath = await WriteMarkdownAsync(markdown, fileName, outputDirectory, cancellationToken).ConfigureAwait(false);
63+
var result = CreateSuccessResult(conversion, url, outputPath);
6064
return new ConversionSummary(new[] { result });
6165
}
6266

@@ -92,4 +96,58 @@ private static async Task<string> WriteMarkdownAsync(string? markdown, string in
9296
await File.WriteAllTextAsync(path, markdown ?? string.Empty, Encoding.UTF8, cancellationToken).ConfigureAwait(false);
9397
return path;
9498
}
99+
100+
/// <summary>
/// Builds the <see cref="ConversionResult"/> for a conversion that completed
/// and whose Markdown has already been written to <paramref name="outputPath"/>.
/// </summary>
/// <param name="conversion">The converter output to summarise.</param>
/// <param name="input">The file path or URL that was converted.</param>
/// <param name="outputPath">Path of the Markdown file that was written.</param>
/// <returns>A successful result carrying the title and page/image/table/attachment counts.</returns>
private static ConversionResult CreateSuccessResult(DocumentConverterResult conversion, string input, string outputPath)
{
    // A missing metadata bag is treated as empty so the lookups below yield 0 / null.
    var metadata = conversion.Metadata ?? new Dictionary<string, string>();

    var resolvedTitle = ResolveTitle(conversion);
    var pages = ParseCount(metadata, MetadataKeys.DocumentPages);

    // Image and table counts come from the artifacts, not from metadata.
    var images = conversion.Artifacts?.Images?.Count ?? 0;
    var tables = conversion.Artifacts?.Tables?.Count ?? 0;

    var attachments = ParseCount(metadata, MetadataKeys.EmailAttachmentsCount);

    // A blank attachment listing is reported as "no summary".
    var attachmentNames = TryGetValue(metadata, MetadataKeys.EmailAttachments);
    if (string.IsNullOrWhiteSpace(attachmentNames))
    {
        attachmentNames = null;
    }

    return new ConversionResult(
        Input: input,
        Output: outputPath,
        Success: true,
        Error: null,
        SegmentCount: conversion.Segments.Count,
        Title: resolvedTitle,
        PageCount: pages,
        ImageCount: images,
        TableCount: tables,
        AttachmentCount: attachments,
        AttachmentSummary: attachmentNames);
}
123+
124+
/// <summary>
/// Picks the best available title for a converted document.
/// </summary>
/// <param name="conversion">The conversion result to inspect.</param>
/// <returns>
/// <c>conversion.Title</c> when it is non-blank; otherwise the non-blank value stored under
/// <c>MetadataKeys.DocumentTitle</c>; otherwise <c>null</c>.
/// </returns>
private static string? ResolveTitle(DocumentConverterResult conversion)
{
    if (!string.IsNullOrWhiteSpace(conversion.Title))
    {
        return conversion.Title;
    }

    // CreateSuccessResult treats a null metadata bag as empty; do the same here so a
    // missing bag means "no title" rather than a NullReferenceException.
    if (conversion.Metadata is { } metadata &&
        metadata.TryGetValue(MetadataKeys.DocumentTitle, out var title) &&
        !string.IsNullOrWhiteSpace(title))
    {
        return title;
    }

    return null;
}
138+
139+
/// <summary>
/// Reads a non-negative integer counter from a metadata dictionary.
/// </summary>
/// <param name="metadata">The metadata to search.</param>
/// <param name="key">The metadata key holding the counter.</param>
/// <returns>
/// The parsed value when the key exists and holds a non-negative integer
/// (invariant culture); otherwise 0.
/// </returns>
private static int ParseCount(IReadOnlyDictionary<string, string> metadata, string key)
{
    if (!metadata.TryGetValue(key, out var raw))
    {
        return 0;
    }

    var isNumber = int.TryParse(raw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var count);

    // Unparseable and negative values are both reported as "no count".
    return isNumber && count >= 0 ? count : 0;
}
150+
151+
/// <summary>
/// Looks up a metadata value, returning <c>null</c> when the key is absent.
/// </summary>
/// <param name="metadata">The metadata to search.</param>
/// <param name="key">The metadata key to look up.</param>
/// <returns>The stored value, or <c>null</c> when <paramref name="key"/> is not present.</returns>
private static string? TryGetValue(IReadOnlyDictionary<string, string> metadata, string key)
{
    if (metadata.TryGetValue(key, out var found))
    {
        return found;
    }

    return null;
}
95153
}

src/MarkItDown.Cli/InteractiveCli.cs

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,6 @@ private async Task ConvertDirectoryAsync()
136136
var summary = await RunWithProgressAsync(progress => conversionService.ConvertFilesAsync(files, outputDir, options, progress), files.Count);
137137
RenderSummary(summary);
138138
PromptToOpenDirectory(outputDir, summary);
139-
PromptToOpenDirectory(outputDir, summary);
140139
}
141140

142141
private async Task ConvertUrlAsync()
@@ -378,6 +377,8 @@ private void RenderSummary(ConversionSummary summary)
378377
{
379378
var table = new Table().Border(TableBorder.Rounded).Title("Conversion Summary");
380379
table.AddColumn("Input");
380+
table.AddColumn("Title");
381+
table.AddColumn("Highlights");
381382
table.AddColumn("Output");
382383
table.AddColumn("Segments");
383384
table.AddColumn("Status");
@@ -387,6 +388,8 @@ private void RenderSummary(ConversionSummary summary)
387388
var status = result.Success ? "[green]Success[/]" : $"[red]{Markup.Escape(result.Error ?? "Failed")}[/]";
388389
table.AddRow(
389390
Markup.Escape(result.Input),
391+
string.IsNullOrWhiteSpace(result.Title) ? "-" : Markup.Escape(result.Title),
392+
FormatHighlights(result),
390393
result.Output is null ? "-" : Markup.Escape(result.Output),
391394
result.SegmentCount.ToString(),
392395
status);
@@ -396,6 +399,62 @@ private void RenderSummary(ConversionSummary summary)
396399
var total = summary.Results.Count;
397400
var successPercent = total == 0 ? 0 : (double)summary.SuccessCount / total * 100d;
398401
AnsiConsole.MarkupLine($"[green]Completed[/]: {summary.SuccessCount}/{total} succeeded ({successPercent:0.##}%), [red]{summary.FailureCount} failed[/].");
402+
403+
var totals = BuildTotalsSummary(summary);
404+
if (!string.IsNullOrWhiteSpace(totals))
405+
{
406+
AnsiConsole.MarkupLine(totals);
407+
}
408+
}
409+
410+
/// <summary>
/// Renders the per-document "Highlights" cell for the summary table.
/// </summary>
/// <param name="result">The conversion result whose counts are shown.</param>
/// <returns>
/// A markup-escaped, comma-separated list of the counts that are greater than zero
/// (pages, images, tables, attachments, in that order), or "-" when all are zero.
/// </returns>
private static string FormatHighlights(ConversionResult result)
{
    // Label/count pairs in display order; zero or negative counts are omitted.
    var stats = new (string Label, int Count)[]
    {
        ("Pages", result.PageCount),
        ("Images", result.ImageCount),
        ("Tables", result.TableCount),
        ("Attachments", result.AttachmentCount),
    };

    var visible = new List<string>();
    foreach (var (label, count) in stats)
    {
        if (count > 0)
        {
            visible.Add($"{label}: {count}");
        }
    }

    if (visible.Count == 0)
    {
        return "-";
    }

    return Markup.Escape(string.Join(", ", visible));
}
434+
435+
/// <summary>
/// Builds the one-line aggregate shown beneath the summary table.
/// </summary>
/// <param name="summary">The batch summary whose totals are reported.</param>
/// <returns>
/// A markup line listing every total greater than zero (pages, images, tables,
/// attachments, in that order), or <c>null</c> when all totals are zero.
/// </returns>
private static string? BuildTotalsSummary(ConversionSummary summary)
{
    // Label/total pairs in display order; zero or negative totals are omitted.
    var totals = new (string Label, int Total)[]
    {
        ("pages", summary.TotalPages),
        ("images", summary.TotalImages),
        ("tables", summary.TotalTables),
        ("attachments", summary.TotalAttachments),
    };

    var segments = new List<string>();
    foreach (var (label, total) in totals)
    {
        if (total > 0)
        {
            segments.Add($"{label}: {total}");
        }
    }

    if (segments.Count == 0)
    {
        return null;
    }

    return $"[grey]Aggregate[/]: {string.Join(", ", segments)}.";
}
400459

401460
private void PreviewFile()

src/MarkItDown/Converters/Documents/EmlConverter.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ public override async Task<DocumentConverterResult> ConvertAsync(Stream stream,
9797
[MetadataKeys.DocumentPages] = extraction.Segments.Count.ToString(CultureInfo.InvariantCulture),
9898
[MetadataKeys.DocumentImages] = extraction.Artifacts.Images.Count.ToString(CultureInfo.InvariantCulture),
9999
[MetadataKeys.DocumentTables] = extraction.Artifacts.Tables.Count.ToString(CultureInfo.InvariantCulture),
100-
["email.attachments.count"] = conversion.Attachments.Count.ToString(CultureInfo.InvariantCulture),
100+
[MetadataKeys.EmailAttachmentsCount] = conversion.Attachments.Count.ToString(CultureInfo.InvariantCulture),
101101
[MetadataKeys.WorkspaceDirectory] = workspace.DirectoryPath
102102
};
103103

@@ -108,7 +108,7 @@ public override async Task<DocumentConverterResult> ConvertAsync(Stream stream,
108108

109109
if (conversion.Attachments.Count > 0)
110110
{
111-
metadata["email.attachments"] = string.Join("; ", conversion.Attachments.Select(static attachment => attachment.Name));
111+
metadata[MetadataKeys.EmailAttachments] = string.Join("; ", conversion.Attachments.Select(static attachment => attachment.Name));
112112
}
113113

114114
foreach (var pair in extraction.Artifacts.Metadata)
@@ -357,8 +357,8 @@ private EmlExtractionResult BuildExtraction(string markdown, StreamInfo streamIn
357357

358358
if (attachments.Count > 0)
359359
{
360-
artifacts.Metadata["email.attachments.count"] = attachments.Count.ToString(CultureInfo.InvariantCulture);
361-
artifacts.Metadata["email.attachments"] = string.Join("; ", attachments.Select(static attachment => attachment.Name));
360+
artifacts.Metadata[MetadataKeys.EmailAttachmentsCount] = attachments.Count.ToString(CultureInfo.InvariantCulture);
361+
artifacts.Metadata[MetadataKeys.EmailAttachments] = string.Join("; ", attachments.Select(static attachment => attachment.Name));
362362
}
363363

364364
return new EmlExtractionResult(segments, artifacts);

src/MarkItDown/Converters/Documents/EpubConverter.cs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -269,13 +269,13 @@ private static async Task<Dictionary<string, string>> ExtractMetadataAsync(ZipAr
269269
}
270270

271271
var manager = BuildNamespaceManager(document);
272-
metadata["title"] = GetFirstTextContent(document, "//opf:metadata/dc:title", manager);
273-
metadata["author"] = string.Join(", ", GetAllTextContents(document, "//opf:metadata/dc:creator", manager));
274-
metadata["language"] = GetFirstTextContent(document, "//opf:metadata/dc:language", manager);
275-
metadata["publisher"] = GetFirstTextContent(document, "//opf:metadata/dc:publisher", manager);
276-
metadata["date"] = GetFirstTextContent(document, "//opf:metadata/dc:date", manager);
277-
metadata["description"] = GetFirstTextContent(document, "//opf:metadata/dc:description", manager);
278-
metadata["identifier"] = GetFirstTextContent(document, "//opf:metadata/dc:identifier", manager);
272+
metadata[MetadataKeys.EpubTitle] = GetFirstTextContent(document, "//opf:metadata/dc:title", manager);
273+
metadata[MetadataKeys.EpubAuthor] = string.Join(", ", GetAllTextContents(document, "//opf:metadata/dc:creator", manager));
274+
metadata[MetadataKeys.EpubLanguage] = GetFirstTextContent(document, "//opf:metadata/dc:language", manager);
275+
metadata[MetadataKeys.EpubPublisher] = GetFirstTextContent(document, "//opf:metadata/dc:publisher", manager);
276+
metadata[MetadataKeys.EpubDate] = GetFirstTextContent(document, "//opf:metadata/dc:date", manager);
277+
metadata[MetadataKeys.EpubDescription] = GetFirstTextContent(document, "//opf:metadata/dc:description", manager);
278+
metadata[MetadataKeys.EpubIdentifier] = GetFirstTextContent(document, "//opf:metadata/dc:identifier", manager);
279279

280280
return metadata;
281281
}
@@ -461,7 +461,7 @@ private static EpubExtractionResult BuildExtraction(IReadOnlyList<SectionContent
461461
var metadata = new Dictionary<string, string>
462462
{
463463
[MetadataKeys.Page] = pageNumber.ToString(CultureInfo.InvariantCulture),
464-
["epub.section"] = section.Label
464+
[MetadataKeys.EpubSection] = section.Label
465465
};
466466

467467
var segment = new DocumentSegment(

0 commit comments

Comments
 (0)