Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .NET(v4.0)/ConvertHtmlToPdf/ConvertHtmlToPdf.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="itext7.bouncy-castle-adapter" Version="8.0.4" />
<PackageReference Include="itext7.pdfhtml" Version="5.0.4" />
</ItemGroup>

<!-- Copies the sample HTML page into the HtmlFile folder beside the build output.
     Uses the built-in Copy task instead of the Windows-only xcopy command so the
     project also builds on Linux and macOS. -->
<Target Name="PostBuild" AfterTargets="PostBuildEvent">
  <Copy SourceFiles="$(ProjectDir)..\..\Data\other-doc-type\web-page.html" DestinationFolder="$(TargetDir)HtmlFile" />
</Target>

</Project>
32 changes: 32 additions & 0 deletions .NET(v4.0)/ConvertHtmlToPdf/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// coding: utf - 8
// --------------------------------------------------------------------------
// Copyright(c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See License.txt in the project root for
// license information.
// --------------------------------------------------------------------------
using System.Diagnostics;
using System.Text;
using iText.Html2pdf;

// Converts an HTML string to a PDF document.
//   htmlString:     the HTML markup to convert (encoded as UTF-8 before conversion).
//   outputFilePath: path of the PDF file to write; created, or overwritten if it exists.
Action<string, string> ConvertHtmlToPdf = (string htmlString, string outputFilePath) =>
{
    using var htmlStream = new MemoryStream(Encoding.UTF8.GetBytes(htmlString));
    // FileMode.Create truncates an existing file. OpenOrCreate would keep the old
    // bytes beyond the end of the new content, leaving a corrupt PDF whenever the
    // new document is shorter than the one already on disk.
    using var pdfFileStream = new FileStream(outputFilePath, FileMode.Create, FileAccess.Write);
    HtmlConverter.ConvertToPdf(htmlStream, pdfFileStream);
};

// Resolve paths from the application's base directory (where the build places the
// HtmlFile folder) instead of the current working directory, which differs when the
// sample is started with `dotnet run` or from another folder.
var baseDir = AppContext.BaseDirectory;
// Path.Combine picks the correct directory separator for the current platform.
var htmlContent = File.ReadAllText(Path.Combine(baseDir, "HtmlFile", "web-page.html"));
var pdfOutputFolder = Path.Combine(baseDir, "Output");
// CreateDirectory does nothing when the folder already exists, so no Exists check is needed.
Directory.CreateDirectory(pdfOutputFolder);

var pdfOutputPath = Path.Combine(pdfOutputFolder, "converted.pdf");
ConvertHtmlToPdf(htmlContent, pdfOutputPath);

Console.WriteLine($"Pdf convert successfully in {pdfOutputPath}");
// Open the generated PDF with the operating system's default viewer.
Process.Start(new ProcessStartInfo(pdfOutputPath) { UseShellExecute = true });

11 changes: 10 additions & 1 deletion .NET(v4.0)/sdk-samples.sln
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Sample Code Snippet For Doc
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CodeSnippetForApiVer_2024-02-29_Preview", "CodeSnippetForApiVer_2024-02-29_Preview\CodeSnippetForApiVer_2024-02-29_Preview.csproj", "{EFCE60E7-6544-443C-8C68-53BFC9447B51}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CodeSnippetForApiVer_2023-10-31_Preview", "CodeSnippetForApiVer_2023-10-31_Preview\CodeSnippetForApiVer_2023-10-31_Preview.csproj", "{BC2D5C4C-C70E-4FB1-AD1E-1505F97FF231}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CodeSnippetForApiVer_2023-10-31_Preview", "CodeSnippetForApiVer_2023-10-31_Preview\CodeSnippetForApiVer_2023-10-31_Preview.csproj", "{BC2D5C4C-C70E-4FB1-AD1E-1505F97FF231}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ConvertHtmlToPdf", "ConvertHtmlToPdf\ConvertHtmlToPdf.csproj", "{BD96BDFF-643C-446C-815B-A5E51FC48166}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "FileConversion", "FileConversion", "{EC23A1E8-D9A3-44E2-B63C-567C9145C165}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand All @@ -29,13 +33,18 @@ Global
{BC2D5C4C-C70E-4FB1-AD1E-1505F97FF231}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BC2D5C4C-C70E-4FB1-AD1E-1505F97FF231}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BC2D5C4C-C70E-4FB1-AD1E-1505F97FF231}.Release|Any CPU.Build.0 = Release|Any CPU
{BD96BDFF-643C-446C-815B-A5E51FC48166}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BD96BDFF-643C-446C-815B-A5E51FC48166}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BD96BDFF-643C-446C-815B-A5E51FC48166}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BD96BDFF-643C-446C-815B-A5E51FC48166}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{EFCE60E7-6544-443C-8C68-53BFC9447B51} = {E9CBB1EB-6D19-4981-A5F2-EFEA3CF0F0A7}
{BC2D5C4C-C70E-4FB1-AD1E-1505F97FF231} = {E9CBB1EB-6D19-4981-A5F2-EFEA3CF0F0A7}
{BD96BDFF-643C-446C-815B-A5E51FC48166} = {EC23A1E8-D9A3-44E2-B63C-567C9145C165}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {5CBC567B-C775-4E10-BCB4-E019111881C4}
Expand Down
80 changes: 80 additions & 0 deletions Data/other-doc-type/web-page.html

Large diffs are not rendered by default.

83 changes: 83 additions & 0 deletions Doc/Convert Office File To Pdf.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Convert Office File To Pdf Format By Microsoft Graph API

## Overview
Microsoft Graph API provides a good way to convert Office files to PDF format. Let’s take a quick look at the functional core. The [Download a file in another format](https://learn.microsoft.com/en-us/graph/api/driveitem-get-content-format?view=graph-rest-1.0&tabs=http) API supports many file types, summarized below:

| Format value | Description | Supported source extensions |
| ------------ | ----------- | --------------------------- |
| pdf | Converts the item into PDF format. | csv, doc, docx, odp, ods, odt, pot, potm, potx, pps, ppsx, ppsxm, ppt, pptm, pptx, rtf, xls, xlsx |
| html | Converts the item into HTML format. | loop, fluid, wbtx |

It supports converting almost all Microsoft Office document formats to PDF with good quality. Before using this API, you need to understand the authentication and authorization concepts in the Microsoft identity platform.

## Prerequisites
- A Microsoft Entra ID tenant. If you don't have a tenant, create a [free Azure account to get free subscription](https://azure.microsoft.com/free/?WT.mc_id=A261C142F).
- An account that has at least the [Cloud Application Administrator](https://learn.microsoft.com/en-us/entra/identity/role-based-access-control/permissions-reference?toc=%2Fgraph%2Ftoc.json#cloud-application-administrator) role.
- A <a name="drive">drive</a> resource to store files. It can be [OneDrive](https://www.microsoft.com/en-us/microsoft-365/onedrive/online-cloud-storage/), [OneDrive for Business](https://www.microsoft.com/en-us/microsoft-365/onedrive/onedrive-for-business), or [SharePoint](https://www.microsoft.com/en-us/microsoft-365/sharepoint/collaboration). For enterprise scenarios, [SharePoint](https://www.microsoft.com/en-us/microsoft-365/sharepoint/collaboration) is recommended.

## Process
Comprehensive documentation is available for the [Microsoft Graph API](https://learn.microsoft.com/en-us/graph/overview). Because Microsoft Graph API covers a wide range of functionality, this article only summarizes the topics related to Microsoft Office file conversion.

### 1. Register the app in Microsoft Identity Platform
To call Microsoft Graph, an app must obtain an access token from the Microsoft identity platform. To register the app in Microsoft Identity Platform, reference the steps from https://learn.microsoft.com/en-us/graph/auth-register-app-v2.

### 2. Authentication and authorization basics
As introduced in the previous step, the app can obtain an access token. The access token includes information about whether the app is authorized to access Microsoft Graph on behalf of a signed-in user or with its own identity.
- Introduction to access scenarios : https://learn.microsoft.com/en-us/graph/auth/auth-concepts#access-scenarios. For enterprise scenarios, **Get access without a user** is recommended.
- Get access without a user : https://learn.microsoft.com/en-us/graph/auth-v2-service?tabs=http
+ Extension - OAuth 2.0 client credentials flow : https://learn.microsoft.com/en-us/entra/identity-platform/v2-oauth2-client-creds-grant-flow
+ Extension - OAuth 2.0 client credentials flow by SDK : https://learn.microsoft.com/en-us/graph/sdks/choose-authentication-providers?view=graph-rest-1.0#client-credentials-provider

### 3. Create temporary folder by Microsoft Graph API
After getting the access token, you can start using the Microsoft Graph API.
It is recommended to create a temporary folder to store the Microsoft Office file by using *[API: Create a new folder in a drive](https://learn.microsoft.com/en-us/graph/api/driveitem-post-children?view=graph-rest-1.0&tabs=http)*. The temporary folder acts as a file transfer station. Once the conversion is complete, the temporary folder and the files in it can be deleted easily.


### 4. Upload the Microsoft Office file to drive by Microsoft Graph API
When the temporary folder is ready, you can upload the Microsoft Office file to the temporary folder in the <a href="#drive">drive</a> by using *[API: Upload or replace the contents of a driveItem](https://learn.microsoft.com/en-us/graph/api/driveitem-put-content?view=graph-rest-1.0&tabs=http)*. At this point, you have completed all the preparation for the Microsoft Office file conversion.

### 5. Convert the Microsoft Office file to pdf format by Microsoft Graph API
This is the key step: call the *[API: Download a file in another format](https://learn.microsoft.com/en-us/graph/api/driveitem-get-content-format?view=graph-rest-1.0&tabs=http)* to convert the Microsoft Office file to PDF format and save it locally. <br>
Note: the ***format*** parameter must be set to ***pdf***.

- #### Sample code for Python:
~~~
query_params = ContentRequestBuilder.ContentRequestBuilderGetQueryParameters(
format="pdf",
)
request_config = ContentRequestBuilder.ContentRequestBuilderGetRequestConfiguration(
query_parameters=query_params
)
pdf_bytes = (
await graph_client.drives.by_drive_id(user_drive_id)
.items.by_drive_item_id(file_item.id)
.content.get(request_config)
)

pdf_abspath = os.path.abspath(
os.path.join(
save_dir_path,
f"./{file_name}.pdf",
)
)
~~~

- #### Sample code for C#:
~~~
using var pdfStream = await graphClient.Drives[driveId].Items[fileItem.Id].Content.GetAsync((requestConfiguration) =>
{
requestConfiguration.QueryParameters.Format = "pdf";
});

if (pdfStream != null)
{
var savePdfFileName = $"{saveDirPath}\\{fileName}.pdf";
using FileStream saveStream = File.Create(savePdfFileName);
pdfStream.CopyTo(saveStream);
}
~~~

### 6. Delete temporary folder by Microsoft Graph API
After converting the Microsoft Office file to PDF format successfully, it's better to delete the temporary folder that was used to store the Microsoft Office file, to keep the drive clean. You can implement this by using ***[API: Delete a DriveItem](https://learn.microsoft.com/en-us/graph/api/driveitem-delete?view=graph-rest-1.0&tabs=http)***.

This is an introduction to the complete process of converting a Microsoft Office file to PDF format. You can integrate the APIs described above into your existing system to implement this feature.
69 changes: 69 additions & 0 deletions Python(v4.0)/Others/sample_convert_html_to_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# coding: utf-8

# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

"""
FILE: sample_convert_html_to_pdf.py

DESCRIPTION:
This sample demonstrates how to convert a html file to the pdf file.
PREREQUISITES:
Before using this sample, install the following required components:
1).Install python-pdfkit:
"$ pip install pdfkit"
2).Install wkhtmltopdf:
-Debian/Ubuntu:
"$ sudo apt-get install wkhtmltopdf"
-macOS:
"$ brew install homebrew/cask/wkhtmltopdf"
-Windows and other options: check https://wkhtmltopdf.org/downloads.html for wkhtmltopdf binary installers
More information about pdfkit, reference from https://pypi.org/project/pdfkit/.
USAGE:
python sample_convert_html_to_pdf.py
"""

import os
import pdfkit


def convert_html_file_to_pdf(html_path, save_pdf_path):
    """Convert the HTML file at ``html_path`` into a PDF saved at ``save_pdf_path``.

    The HTML file is read as UTF-8 text and passed to ``pdfkit.from_string``.
    The destination directory is created first when it does not exist yet.

    Args:
        html_path: Path of the HTML file to convert.
        save_pdf_path: Path of the PDF file to write.

    Returns:
        The value returned by ``pdfkit.from_string`` for the conversion.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        html_str = f.read()

    directory_to_save_pdf = os.path.dirname(save_pdf_path)
    # A bare file name has an empty dirname (meaning the current directory);
    # os.makedirs("") would raise, so only create a directory when one is named.
    if directory_to_save_pdf:
        os.makedirs(directory_to_save_pdf, exist_ok=True)

    return pdfkit.from_string(html_str, save_pdf_path)


if __name__ == "__main__":
    # Folder that contains this sample script.
    current_folder_path = os.path.dirname(os.path.abspath(__file__))

    # The sample page is at Data/other-doc-type/web-page.html, two levels above this folder.
    path_of_sample_html = os.path.abspath(
        os.path.join(
            current_folder_path,
            "..",
            "..",
            "Data",
            "other-doc-type",
            "web-page.html",
        )
    )

    # The PDF is written to a "result" folder next to this script.
    path_to_save_pdf = os.path.abspath(
        os.path.join(current_folder_path, "result", "converted.pdf")
    )

    try:
        isSuccessful = convert_html_file_to_pdf(path_of_sample_html, path_to_save_pdf)
    except OSError as error:
        # A missing HTML file raises OSError, and pdfkit is expected to raise
        # IOError/OSError when wkhtmltopdf is missing or fails, so report the
        # failure instead of ending with an unhandled traceback.
        isSuccessful = False
        print(f"Conversion failed: {error}")

    if isSuccessful:
        print(f"Convert pdf successfully in {path_to_save_pdf}")
    else:
        print("Something wrong. Check the html file please.")