How to make a large PDF document searchable using parallel processing in VB.NET with ByteScout Robotic Process Automation

What is ByteScout Robotic Process Automation? It is a set of components and tools for quick RPA (Robotic Process Automation) implementation.

On-demand (REST Web API) version:
 Web API (on-demand version)

On-premise offline SDK for Windows:
 60 Day Free Trial (on-premise)

Module1.vb

      
Imports System
Imports System.Diagnostics
Imports System.IO
Imports System.Threading
Imports Bytescout.PDFExtractor

''' <summary>
''' Makes a PDF document searchable by splitting it into fixed-size page
''' ranges, running OCR on every range on a thread-pool thread, and merging
''' the processed pieces back into a single result file.
''' </summary>
Module Module1

    ' Number of pages handed to each worker.
    Private Const PagesPerPiece As Integer = 10

    ' Limit to 4 threads in queue.
    ' Set this value to number of your processor cores for max performance.
    Dim ThreadLimiter As Semaphore = New Semaphore(4, 4)

    ' Number of pieces whose processing failed. Incremented from pool threads
    ' with Interlocked and read by Main only after every worker has signalled.
    Private failedPieces As Integer = 0

    ''' <summary>
    ''' Entry point: reads the page count of "sample.pdf", queues one work item
    ''' per page range, waits for all of them, then merges the pieces into
    ''' "result.pdf" and deletes the temporary files.
    ''' </summary>
    <MTAThread>
    Sub Main()

        Const inputFile As String = "sample.pdf"
        Const resultFile As String = "result.pdf"

        Dim pageCount As Integer

        ' Get document page count
        Using infoExtractor = New InfoExtractor()
            infoExtractor.LoadDocumentFromFile(inputFile)
            pageCount = infoExtractor.GetPageCount()
        End Using

        ' Nothing to split, OCR or merge for an empty document.
        If pageCount <= 0 Then
            Console.WriteLine("The document '{0}' contains no pages; nothing to do.", inputFile)
            WaitForKey()
            Return
        End If

        ' Process the document by PagesPerPiece-page pieces (ceiling division).
        Dim numberOfThreads As Integer = (pageCount + PagesPerPiece - 1) \ PagesPerPiece

        Dim doneEvents(numberOfThreads - 1) As WaitHandle
        Dim pieces(numberOfThreads - 1) As String
        Dim stopwatch As Stopwatch = Stopwatch.StartNew()

        ' Run threads
        For i As Integer = 0 To numberOfThreads - 1

            ' Wait for the queue
            ThreadLimiter.WaitOne()

            doneEvents(i) = New ManualResetEvent(False)

            ' Zero-based, inclusive page range of this piece.
            Dim startPage As Integer = i * PagesPerPiece
            Dim endPage As Integer = Math.Min(pageCount - 1, (i + 1) * PagesPerPiece - 1)

            ' NOTE(review): when the whole document fits into a single piece the
            ' last page is excluded from the range. This is kept exactly as in the
            ' original sample; confirm against the SDK whether it is a required
            ' workaround or an off-by-one that drops the final page.
            If numberOfThreads = 1 Then
                endPage = endPage - 1
            End If

            pieces(i) = String.Format("temp-{0}-{1}.pdf", startPage, endPage)

            ThreadPool.QueueUserWorkItem(
                New WaitCallback(AddressOf ThreadProc),
                New Object() {i, doneEvents(i), inputFile, pieces(i), startPage, endPage})
        Next

        ' Wait for all threads. The handles are awaited one by one because
        ' WaitHandle.WaitAll accepts at most 64 handles, which would limit the
        ' input to 64 pieces.
        For Each doneEvent As WaitHandle In doneEvents
            doneEvent.WaitOne()
            doneEvent.Close()
        Next

        If failedPieces > 0 Then
            ' At least one piece is missing, so a merged result would be incomplete.
            Console.WriteLine("{0} of {1} pieces failed; '{2}' was not created.", failedPieces, numberOfThreads, resultFile)
        Else
            ' Merge pieces
            Using merger = New DocumentMerger
                merger.Merge(pieces, resultFile)
            End Using
        End If

        ' Delete temp files (File.Delete ignores files that do not exist)
        For Each tempFile As String In pieces
            File.Delete(tempFile)
        Next

        If failedPieces = 0 Then
            Console.WriteLine("All done in {0}.", stopwatch.Elapsed)
        End If

        WaitForKey()

    End Sub

    ''' <summary>
    ''' Thread-pool callback that extracts one page range from the input
    ''' document and writes a searchable (OCR'ed) copy of it.
    ''' </summary>
    ''' <param name="stateInfo">
    ''' An Object array: (0) piece index, (1) ManualResetEvent to signal on
    ''' completion, (2) input file, (3) output file, (4) zero-based start page,
    ''' (5) zero-based end page.
    ''' </param>
    Sub ThreadProc(ByVal stateInfo As Object)

        Dim args As Object() = DirectCast(stateInfo, Object())

        Dim threadIndex As Integer = CInt(args(0))
        Dim waitEvent As ManualResetEvent = DirectCast(args(1), ManualResetEvent)
        Dim inputFile As String = CStr(args(2))
        Dim outputFile As String = CStr(args(3))
        Dim startPage As Integer = CInt(args(4))
        Dim endPage As Integer = CInt(args(5))

        ' Intermediate file holding the extracted, not yet processed page range.
        Dim chunk As String = String.Format("temp-{0}-{1}", startPage, endPage)

        Try
            Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage)

            Dim stopwatch As Stopwatch = Stopwatch.StartNew()

            ' Extract a piece of document (the splitter is called with 1-based page numbers)
            Using splitter = New DocumentSplitter
                splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1)
            End Using

            ' Process the piece
            Using searchablePdfMaker As New SearchablePDFMaker("demo", "demo")

                searchablePdfMaker.OCRDetectPageRotation = True
                searchablePdfMaker.OCRLanguageDataFolder = "C:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata"

                searchablePdfMaker.LoadDocumentFromFile(chunk)

                ' 300 DPI resolution is recommended.
                ' Using of higher values will slow down the processing but does not guarantee the higher quality.
                searchablePdfMaker.OCRResolution = 300

                searchablePdfMaker.MakePDFSearchable(outputFile)

            End Using

            Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed)

        Catch ex As Exception
            ' An exception escaping a thread-pool callback would terminate the
            ' process, so record the failure and let Main decide what to do.
            Interlocked.Increment(failedPieces)
            Console.WriteLine("Thread #{0} failed: {1}", threadIndex, ex.Message)

        Finally
            ' Remove the intermediate file on success and on failure alike.
            TryDeleteFile(chunk)

            ' Signal the thread is finished
            waitEvent.Set()

            ' Release semaphore
            ThreadLimiter.Release()
        End Try

    End Sub

    ''' <summary>
    ''' Deletes a file and reports, instead of throwing, when that is not
    ''' possible, so that it is safe to call from a Finally block.
    ''' </summary>
    Private Sub TryDeleteFile(ByVal path As String)
        Try
            File.Delete(path)
        Catch ex As IOException
            Console.WriteLine("Could not delete '{0}': {1}", path, ex.Message)
        Catch ex As UnauthorizedAccessException
            Console.WriteLine("Could not delete '{0}': {1}", path, ex.Message)
        End Try
    End Sub

    ''' <summary>
    ''' Keeps the console window open until the user presses a key.
    ''' </summary>
    Private Sub WaitForKey()
        Console.WriteLine()
        Console.WriteLine("Press any key to exit...")
        Console.ReadKey()
    End Sub

End Module

MultithreadProcessing.sln

      
Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.26730.8 MinimumVisualStudioVersion = 10.0.40219.1 Project("{F184B08F-C81C-45F6-A57F-5ABD9991F28F}") = "MultithreadProcessing", "MultithreadProcessing.vbproj", "{F18881BC-38FA-4D69-AD4E-4DC6CB637251}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Release|Any CPU = Release|Any CPU EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {F18881BC-38FA-4D69-AD4E-4DC6CB637251}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {F18881BC-38FA-4D69-AD4E-4DC6CB637251}.Debug|Any CPU.Build.0 = Debug|Any CPU {F18881BC-38FA-4D69-AD4E-4DC6CB637251}.Release|Any CPU.ActiveCfg = Release|Any CPU {F18881BC-38FA-4D69-AD4E-4DC6CB637251}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {BADFE9BE-5CD2-45AC-9023-A1BB794B42E4} EndGlobalSection EndGlobal

MultithreadProcessing.vbproj

      
<?xml version="1.0" encoding="utf-8"?> <Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> <PropertyGroup> <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> <ProjectGuid>{F18881BC-38FA-4D69-AD4E-4DC6CB637251}</ProjectGuid> <OutputType>Exe</OutputType> <StartupObject>MultithreadProcessing.Module1</StartupObject> <RootNamespace>MultithreadProcessing</RootNamespace> <AssemblyName>MultithreadProcessing</AssemblyName> <FileAlignment>512</FileAlignment> <MyType>Console</MyType> <TargetFrameworkVersion>v4.0</TargetFrameworkVersion> </PropertyGroup> <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> <PlatformTarget>AnyCPU</PlatformTarget> <DebugSymbols>true</DebugSymbols> <DebugType>full</DebugType> <DefineDebug>true</DefineDebug> <DefineTrace>true</DefineTrace> <OutputPath>bin\Debug\</OutputPath> <DocumentationFile>MultithreadProcessing.xml</DocumentationFile> <NoWarn>42016,41999,42017,42018,42019,42032,42036,42020,42021,42022</NoWarn> </PropertyGroup> <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> <PlatformTarget>AnyCPU</PlatformTarget> <DebugType>pdbonly</DebugType> <DefineDebug>false</DefineDebug> <DefineTrace>true</DefineTrace> <Optimize>true</Optimize> <OutputPath>bin\Release\</OutputPath> <DocumentationFile>MultithreadProcessing.xml</DocumentationFile> <NoWarn>42016,41999,42017,42018,42019,42032,42036,42020,42021,42022</NoWarn> </PropertyGroup> <PropertyGroup> <OptionExplicit>On</OptionExplicit> </PropertyGroup> <PropertyGroup> <OptionCompare>Binary</OptionCompare> </PropertyGroup> <PropertyGroup> <OptionStrict>Off</OptionStrict> </PropertyGroup> <PropertyGroup> 
<OptionInfer>On</OptionInfer> </PropertyGroup> <PropertyGroup> <ApplicationManifest>My Project\app.manifest</ApplicationManifest> </PropertyGroup> <ItemGroup> <Reference Include="Bytescout.PDFExtractor, Version=8.2.0.2710, Culture=neutral, PublicKeyToken=f7dd1bd9d40a50eb, processorArchitecture=MSIL"> <SpecificVersion>False</SpecificVersion> <HintPath>S:\Bytescout\PDF Extractor SDK\Bytescout.PDFExtractor\bin\Debug_FULL_4.0\Bytescout.PDFExtractor.dll</HintPath> </Reference> <Reference Include="Bytescout.PDFExtractor.OCRExtension, Version=8.2.0.2710, Culture=neutral, PublicKeyToken=f7dd1bd9d40a50eb, processorArchitecture=MSIL"> <SpecificVersion>False</SpecificVersion> <HintPath>S:\Bytescout\PDF Extractor SDK\Bytescout.PDFExtractor\bin\Debug_FULL_4.0\Bytescout.PDFExtractor.OCRExtension.dll</HintPath> </Reference> <Reference Include="System" /> <Reference Include="System.Data" /> <Reference Include="System.Deployment" /> <Reference Include="System.Xml" /> <Reference Include="System.Core" /> <Reference Include="System.Xml.Linq" /> <Reference Include="System.Data.DataSetExtensions" /> </ItemGroup> <ItemGroup> <Import Include="Microsoft.VisualBasic" /> <Import Include="System" /> <Import Include="System.Collections" /> <Import Include="System.Collections.Generic" /> <Import Include="System.Data" /> <Import Include="System.Diagnostics" /> <Import Include="System.Linq" /> <Import Include="System.Xml.Linq" /> </ItemGroup> <ItemGroup> <Compile Include="Module1.vb" /> <Compile Include="My Project\AssemblyInfo.vb" /> <Compile Include="My Project\Application.Designer.vb"> <AutoGen>True</AutoGen> <DependentUpon>Application.myapp</DependentUpon> </Compile> <Compile Include="My Project\Resources.Designer.vb"> <AutoGen>True</AutoGen> <DesignTime>True</DesignTime> <DependentUpon>Resources.resx</DependentUpon> </Compile> <Compile Include="My Project\Settings.Designer.vb"> <AutoGen>True</AutoGen> <DependentUpon>Settings.settings</DependentUpon> 
<DesignTimeSharedInput>True</DesignTimeSharedInput> </Compile> </ItemGroup> <ItemGroup> <EmbeddedResource Include="My Project\Resources.resx"> <Generator>VbMyResourcesResXFileCodeGenerator</Generator> <LastGenOutput>Resources.Designer.vb</LastGenOutput> <CustomToolNamespace>My.Resources</CustomToolNamespace> <SubType>Designer</SubType> </EmbeddedResource> </ItemGroup> <ItemGroup> <None Include="My Project\app.manifest" /> <None Include="My Project\Application.myapp"> <Generator>MyApplicationCodeGenerator</Generator> <LastGenOutput>Application.Designer.vb</LastGenOutput> </None> <None Include="My Project\Settings.settings"> <Generator>SettingsSingleFileGenerator</Generator> <CustomToolNamespace>My</CustomToolNamespace> <LastGenOutput>Settings.Designer.vb</LastGenOutput> </None> <Content Include="sample.pdf"> <CopyToOutputDirectory>Always</CopyToOutputDirectory> </Content> </ItemGroup> <Import Project="$(MSBuildToolsPath)\Microsoft.VisualBasic.targets" /> <!-- To modify your build process, add your task inside one of the targets below and uncomment it. Other similar extension points exist, see Microsoft.Common.targets. <Target Name="BeforeBuild"> </Target> <Target Name="AfterBuild"> </Target> --> </Project>

VIDEO

ON-PREMISE OFFLINE SDK

Get 60 Day Free Trial

See also:

ON-DEMAND REST WEB API

Get Your API Key

See also:

Related Samples: