diff --git a/OpticalCharacterRecognition/OCRDocument/App.config b/OpticalCharacterRecognition/OCRDocument/App.config
new file mode 100644
index 0000000..aad6a5e
--- /dev/null
+++ b/OpticalCharacterRecognition/OCRDocument/App.config
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/OpticalCharacterRecognition/OCRDocument/OCRDocument.cs b/OpticalCharacterRecognition/OCRDocument/OCRDocument.cs
new file mode 100644
index 0000000..f80b4d0
--- /dev/null
+++ b/OpticalCharacterRecognition/OCRDocument/OCRDocument.cs
@@ -0,0 +1,86 @@
+using System;
+using System.Collections.Generic;
+using Datalogics.PDFL;
+
+/*
+ * Runs OCR on the document recognizing text found on its rasterized pages.
+ *
+ * Copyright (c) 2007-2025, Datalogics, Inc. All rights reserved.
+ *
+ */
+
+namespace OCRDocument
+{
+    /// <summary>
+    /// Console sample that runs OCR over every page of a PDF and saves the result.
+    /// </summary>
+    class OCRDocument
+    {
+        /// <summary>
+        /// Opens the input PDF, calls RecognizePageContents on each page in turn
+        /// with a shared OCR engine, and saves the document to the output path.
+        /// </summary>
+        /// <param name="args">
+        /// Optional overrides: args[0] replaces the default input path and
+        /// args[1] replaces the default output path.
+        /// </param>
+        static void Main(string[] args)
+        {
+            Console.WriteLine("OCRDocument Sample:");
+
+            // lib is never referenced below; the using block only bounds its lifetime.
+            using (Library lib = new Library())
+            {
+                Console.WriteLine("Initialized the library.");
+
+                String sInput = Library.ResourceDirectory + "Sample_Input/scanned_images.pdf";
+                String sOutput = "OCRDocument-out.pdf";
+
+                if (args.Length > 0)
+                    sInput = args[0];
+                if (args.Length > 1)
+                    sOutput = args[1];
+
+                Console.WriteLine("Input file: " + sInput);
+                Console.WriteLine("Writing output to: " + sOutput);
+
+                OCRParams ocrParams = new OCRParams();
+                //The OCRParams.Languages parameter controls which languages the OCR engine attempts
+                //to detect. By default the OCR engine searches for English.
+                List<LanguageSetting> langList = new List<LanguageSetting>();
+                LanguageSetting languageOne = new LanguageSetting(Language.English, false);
+                langList.Add(languageOne);
+
+                //You could add additional languages for the OCR engine to detect by adding
+                //more entries to the LanguageSetting list.
+
+                //LanguageSetting languageTwo = new LanguageSetting(Language.Japanese, false);
+                //langList.Add(languageTwo);
+                ocrParams.Languages = langList;
+
+                // If the resolution for the images in your document is not
+                // 300 dpi, specify a default resolution here. Specifying a
+                // correct resolution gives better results for OCR, especially
+                // with automatic image preprocessing.
+                // ocrParams.Resolution = 600;
+
+                using (OCREngine ocrEngine = new OCREngine(ocrParams))
+                {
+                    //Create a document object using the input file
+                    using (Document doc = new Document(sInput))
+                    {
+                        for (int numPage = 0; numPage < doc.NumPages; numPage++)
+                        {
+                            using (Page page = doc.GetPage(numPage))
+                            {
+                                page.RecognizePageContents(doc, ocrEngine);
+                            }
+                        }
+
+                        doc.Save(SaveFlags.Full, sOutput);
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/OpticalCharacterRecognition/OCRDocument/OCRDocument.csproj b/OpticalCharacterRecognition/OCRDocument/OCRDocument.csproj
new file mode 100644
index 0000000..a920323
--- /dev/null
+++ b/OpticalCharacterRecognition/OCRDocument/OCRDocument.csproj
@@ -0,0 +1,58 @@
+
+
+
+
+    Debug
+    x64
+    {C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}
+    Exe
+    OCRDocument
+    OCRDocument
+    v4.7.2
+    512
+    true
+    true
+
+
+
+
+    x64
+    true
+    full
+    false
+    ..\..\..\dle\build\win-x86-64\Debug\
+    DEBUG;TRACE
+    prompt
+    4
+
+
+    x64
+    pdbonly
+    false
+    bin\Release\
+    TRACE
+    prompt
+    4
+
+
+
+      18.*
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/OpticalCharacterRecognition/OCRDocument/OCRDocument.sln b/OpticalCharacterRecognition/OCRDocument/OCRDocument.sln
new file mode 100644
index 0000000..57e2fb8
--- /dev/null
+++ b/OpticalCharacterRecognition/OCRDocument/OCRDocument.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.33328.57
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCRDocument", "OCRDocument.csproj", "{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}"
+EndProject
+Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Debug|x64.ActiveCfg = Debug|x64 + {C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Debug|x64.Build.0 = Debug|x64 + {C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Release|x64.ActiveCfg = Release|x64 + {C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {CEA60573-4A7F-49A3-8EC5-6DCC54E2E30B} + EndGlobalSection +EndGlobal diff --git a/OpticalCharacterRecognition/OCRDocument/Properties/AssemblyInfo.cs b/OpticalCharacterRecognition/OCRDocument/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..8c8be40 --- /dev/null +++ b/OpticalCharacterRecognition/OCRDocument/Properties/AssemblyInfo.cs @@ -0,0 +1,33 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("OCRDocument")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("Datalogics, Inc.")] +[assembly: AssemblyProduct("OCRDocument")] +[assembly: AssemblyCopyright("Copyright © Datalogics 2019-2025")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("a1a2f184-6250-4843-8d6b-3a72776dd27d")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/OpticalCharacterRecognition/README.md b/OpticalCharacterRecognition/README.md index 8bb70df..5998150 100644 --- a/OpticalCharacterRecognition/README.md +++ b/OpticalCharacterRecognition/README.md @@ -3,3 +3,6 @@ Places recognized text behind the OCR images found on a PDF page. ## ***AddTextToImage*** Adds an image file to a PDF page, runs OCR on the image, and place the recognized text behind it. + +## ***OCRDocument*** +Runs OCR on the document recognizing text found on its rasterized pages.