How to parse a simple document with the Document Parser API in C# using PDF.co Web API

PDF.co Web API: a flexible Web API that includes a full set of functions, from e-signature requests to data extraction, OCR, image recognition, PDF splitting and PDF merging. It can also generate barcodes and read barcodes from images, scans and PDFs.

On-demand (REST Web API) version:
 Web API (on-demand version)

On-premise offline SDK for Windows:
 60 Day Free Trial (on-premise)

AmazonAWS.yml

      
---
# Document Parser template for Amazon Web Services invoices.
templateVersion: 3
templatePriority: 0
sourceId: Amazon Web Services Invoice
detectionRules:
  keywords:
  - Amazon Web Services
  - ATTN
  - Invoice
fields:
  total:
    type: macros
    expression: 'TOTAL AMOUNT DUE ON{{Anything}}{{Dollar}}({{Number}})'
    dataType: decimal
  subTotal:
    type: macros
    expression: '{{LineStart}}{{Spaces}}Charges{{Spaces}}{{Dollar}}({{Number}})'
    dataType: decimal
  dateIssued:
    type: macros
    expression: 'Invoice Date:{{Spaces}}({{Anything}}){{LineEnd}}'
    dataType: date
    dateFormat: MMMM d , yyyy
  invoiceId:
    type: macros
    expression: 'Invoice Number:{{Spaces}}({{Digits}})'
  companyName:
    type: static
    expression: Amazon Web Services, Inc.
  companyWebsite:
    type: static
    expression: aws.amazon.com
  billTo:
    type: rectangle
    expression: 'Bill to Address:{{ToggleSingleLineMode}}({{AnythingGreedy}})'
    rectangle:
    - 33
    - 115.5
    - 213.75
    - 72.75
    pageIndex: 0
  currency:
    type: static
    expression: USD
tables:
- name: table1
  start:
    expression: '{{LineStart}}{{Spaces}}Detail{{LineEnd}}'
  end:
    expression: '{{EndOfPage}}'
  row:
    # Named groups <description> and <unitPrice> are captured per row.
    expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}}){{LineEnd}}'
  columns:
  - name: unitPrice
    type: decimal

ByteScoutWebApiExample.csproj

      
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
  <PropertyGroup>
    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
    <ProjectGuid>{1E1C2C34-017E-4605-AE2B-55EA3313BE51}</ProjectGuid>
    <OutputType>Exe</OutputType>
    <RootNamespace>ByteScoutWebApiExample</RootNamespace>
    <AssemblyName>ByteScoutWebApiExample</AssemblyName>
    <TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
    <FileAlignment>512</FileAlignment>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
    <PlatformTarget>AnyCPU</PlatformTarget>
    <DebugSymbols>true</DebugSymbols>
    <DebugType>full</DebugType>
    <Optimize>false</Optimize>
    <OutputPath>bin\Debug\</OutputPath>
    <DefineConstants>DEBUG;TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
    <PlatformTarget>AnyCPU</PlatformTarget>
    <DebugType>pdbonly</DebugType>
    <Optimize>true</Optimize>
    <OutputPath>bin\Release\</OutputPath>
    <DefineConstants>TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
  </PropertyGroup>
  <ItemGroup>
    <Reference Include="Newtonsoft.Json, Version=10.0.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
      <HintPath>packages\Newtonsoft.Json.10.0.3\lib\net40\Newtonsoft.Json.dll</HintPath>
      <Private>True</Private>
    </Reference>
    <Reference Include="System" />
    <Reference Include="System.Core" />
    <Reference Include="System.Xml.Linq" />
    <Reference Include="System.Data" />
    <Reference Include="System.Xml" />
  </ItemGroup>
  <ItemGroup>
    <Compile Include="Program.cs" />
  </ItemGroup>
  <ItemGroup>
    <None Include="AmazonAWS.yml">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
    <None Include="DigitalOcean.yml">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
    <None Include="Google.yml">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
    <None Include="AmazonAWS.pdf">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
    <None Include="DigitalOcean.pdf">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
    <None Include="Google.pdf">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
    <None Include="packages.config" />
  </ItemGroup>
  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
       Other similar extension points exist, see Microsoft.Common.targets.
  <Target Name="BeforeBuild">
  </Target>
  <Target Name="AfterBuild">
  </Target>
  -->
</Project>

ByteScoutWebApiExample.sln

      
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
VisualStudioVersion = 12.0.40629.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ByteScoutWebApiExample", "ByteScoutWebApiExample.csproj", "{1E1C2C34-017E-4605-AE2B-55EA3313BE51}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{1E1C2C34-017E-4605-AE2B-55EA3313BE51}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{1E1C2C34-017E-4605-AE2B-55EA3313BE51}.Debug|Any CPU.Build.0 = Debug|Any CPU
		{1E1C2C34-017E-4605-AE2B-55EA3313BE51}.Release|Any CPU.ActiveCfg = Release|Any CPU
		{1E1C2C34-017E-4605-AE2B-55EA3313BE51}.Release|Any CPU.Build.0 = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
EndGlobal

DigitalOcean.yml

      
---
# Document Parser template for DigitalOcean invoices.
templateVersion: 3
templatePriority: 0
sourceId: DigitalOcean Invoice
detectionRules:
  keywords:
  # Template will match documents containing the following phrases:
  - DigitalOcean
  - 101 Avenue of the Americas
  - Invoice Number
fields:
  # Static field that will add "DigitalOcean" to the result
  companyName:
    type: static
    expression: DigitalOcean
  # Macro field that will find the text "Invoice Number: 1234567" and return "1234567" to the result
  invoiceId:
    type: macros
    expression: 'Invoice Number: ({{Digits}})'
  # Macro field that will find the text "Date Issued: February 1, 2016" and return the date "February 1, 2016" in ISO format to the result
  dateIssued:
    type: macros
    expression: 'Date Issued: ({{SmartDate}})'
    dataType: date
    dateFormat: auto-mdy
  # Macro field that will find the text "Total: $110.00" and return "110.00" to the result
  total:
    type: macros
    expression: 'Total: {{Dollar}}({{Number}})'
    dataType: decimal
  # Static field that will add "USD" to the result
  currency:
    type: static
    expression: USD
tables:
- name: table1
  # The table will start after the text "Description Hours"
  start:
    expression: 'Description{{Spaces}}Hours'
  # The table will end before the text "Total:"
  end:
    expression: 'Total:'
  # Macro expression that will find table rows "Website-Dev (1GB) 744 01-01 00:00 01-31 23:59 $10.00", etc.
  row:
    # Groups <description>, <hours>, <start>, <end> and <unitPrice> will become columns in the result table.
    expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
  # Suggest data types for table columns (missing columns will have the default "string" type):
  columns:
  - name: hours
    type: integer
  - name: unitPrice
    type: decimal

Google.yml

      
---
# Document Parser template for Google invoices.
templateVersion: 3
templatePriority: 0
sourceId: Google Invoice
detectionRules:
  keywords:
  - Google
  - 77-0493581
  - Invoice
fields:
  invoiceId:
    expression: 'Invoice number:{{Spaces}}({{Digits}})'
  dateIssued:
    expression: 'Issue date:{{Spaces}}({{SmartDate}})'
    dataType: date
    dateFormat: MMM d, yyyy
  total:
    expression: 'Amount due in USD:{{Spaces}}{{Number}}'
    dataType: decimal
  subTotal:
    expression: 'Subtotal in USD:{{Spaces}}{{Number}}'
    dataType: decimal
  taxRate:
    expression: 'State sales tax {{OpeningParenthesis}}{{Digits}}{{Percent}}{{ClosingParenthesis}}'
    dataType: integer
  tax:
    expression: 'State sales tax{{Anything}}{{Number}}{{LineEnd}}'
    dataType: decimal
  companyName:
    type: static
    expression: 'Google LLC'
  billTo:
    type: rectangle
    rectangle:
    - 0
    - 152
    - 280
    - 72
    pageIndex: 0
  billingId:
    expression: 'Billing ID:{{Spaces}}({{DigitsOrSymbols}})'
  currency:
    type: static
    expression: 'USD'
tables:
- name: table1
  start:
    expression: 'Description{{Spaces}}Interval{{Spaces}}Quantity{{Spaces}}Amount'
  end:
    expression: 'Subtotal in USD'
  row:
    # Named groups <description>, <interval>, <quantity> and <amount> are captured per row.
    expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<interval>{{3Letters}}{{Space}}{{Digits}}{{Space}}{{Minus}}{{Space}}{{3Letters}}{{Space}}{{Digits}}){{Spaces}}(?<quantity>{{Digits}}){{Spaces}}(?<amount>{{Number}})'
  columns:
  - name: quantity
    type: integer
  - name: amount
    type: decimal

Program.cs

      
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Threading;

// Cloud API asynchronous "Document Parser" job example.
// Allows to avoid timeout errors when processing huge or scanned PDF documents.
namespace ByteScoutWebApiExample
{
    class Program
    {
        // The authentication key (API Key).
        // Get your own by registering at https://app.pdf.co/documentation/api
        const String API_KEY = "***********************************";

        // Source PDF file
        const string SourceFile = @".\AmazonAWS.pdf";
        //const string SourceFile = @".\DigitalOcean.pdf";
        //const string SourceFile = @".\Google.pdf";

        // PDF document password. Leave empty for unprotected documents.
        // NOTE(review): this constant is not referenced anywhere in this sample.
        const string Password = "";

        // Destination JSON file name
        const string DestinationFile = @".\result.json";

        // (!) Make asynchronous job
        const bool Async = true;

        /// <summary>
        /// Uploads <see cref="SourceFile"/> to the cloud, starts a "Document Parser" job with the
        /// template read from disk, polls the job status until it leaves the "working" state and,
        /// on success, downloads the generated JSON to <see cref="DestinationFile"/>.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Template text. Use Document Parser SDK (https://bytescout.com/products/developer/documentparsersdk/index.html)
            // to create templates.
            // Read template from file:
            String templateText = File.ReadAllText(@".\AmazonAWS.yml");
            //String templateText = File.ReadAllText(@".\DigitalOcean.yml");
            //String templateText = File.ReadAllText(@".\Google.yml");

            // Create standard .NET web client instance.
            // The using block guarantees disposal even if an unexpected exception escapes.
            using (WebClient webClient = new WebClient())
            {
                // Set API Key
                webClient.Headers.Add("x-api-key", API_KEY);

                // 1. RETRIEVE THE PRESIGNED URL TO UPLOAD THE FILE.
                // * If you already have a direct file URL, skip to the step 3.

                // Prepare URL for `Get Presigned URL` API call.
                // Only the parameter value is escaped, so reserved characters in the
                // file name (&, #, +, ...) cannot break the query string.
                string query = string.Format(
                    "https://api.pdf.co/v1/file/upload/get-presigned-url?contenttype=application/octet-stream&name={0}",
                    Uri.EscapeDataString(Path.GetFileName(SourceFile)));

                try
                {
                    // Execute request
                    string response = webClient.DownloadString(query);

                    // Parse JSON response
                    JObject json = JObject.Parse(response);

                    if (json["error"].ToObject<bool>() == false)
                    {
                        // Get URL to use for the file upload
                        string uploadUrl = json["presignedUrl"].ToString();
                        string uploadedFileUrl = json["url"].ToString();

                        // 2. UPLOAD THE FILE TO CLOUD.

                        webClient.Headers.Add("content-type", "application/octet-stream");
                        webClient.UploadFile(uploadUrl, "PUT", SourceFile); // You can use UploadData() instead if your file is byte[] or Stream
                        webClient.Headers.Remove("content-type");

                        // 3. PARSE UPLOADED PDF DOCUMENT

                        // URL for `Document Parser` API call.
                        // The uploaded file URL is passed as a query parameter value, so it must be
                        // escaped as data; otherwise any '?' or '&' it contains would be read as
                        // part of the outer query string.
                        query = string.Format(
                            "https://api.pdf.co/v1/pdf/documentparser?url={0}&async={1}",
                            Uri.EscapeDataString(uploadedFileUrl),
                            Async);

                        Dictionary<string, string> requestBody = new Dictionary<string, string>();
                        requestBody.Add("template", templateText);

                        // Execute request
                        response = webClient.UploadString(query, "POST", JsonConvert.SerializeObject(requestBody));

                        // Parse JSON response
                        json = JObject.Parse(response);

                        if (json["error"].ToObject<bool>() == false)
                        {
                            // Asynchronous job ID
                            string jobId = json["jobId"].ToString();
                            // Get URL of generated JSON file
                            string resultFileUrl = json["url"].ToString();

                            // Check the job status in a loop.
                            // If you don't want to pause the main thread you can rework the code
                            // to use a separate thread for the status checking and completion.
                            do
                            {
                                string status = CheckJobStatus(webClient, jobId); // Possible statuses: "working", "failed", "aborted", "success".

                                // Display timestamp and status (for demo purposes)
                                Console.WriteLine(DateTime.Now.ToLongTimeString() + ": " + status);

                                if (status == "success")
                                {
                                    // Download JSON file
                                    webClient.DownloadFile(resultFileUrl, DestinationFile);

                                    Console.WriteLine("Generated JSON file saved as \"{0}\" file.", DestinationFile);
                                    break;
                                }
                                else if (status == "working")
                                {
                                    // Pause for a few seconds
                                    Thread.Sleep(3000);
                                }
                                else
                                {
                                    // Any other status ends the polling loop.
                                    Console.WriteLine(status);
                                    break;
                                }
                            }
                            while (true);
                        }
                        else
                        {
                            Console.WriteLine(json["message"].ToString());
                        }
                    }
                    else
                    {
                        Console.WriteLine(json["message"].ToString());
                    }
                }
                catch (WebException e)
                {
                    Console.WriteLine(e.ToString());
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

        /// <summary>
        /// Requests the current status of a background job from the job-check endpoint.
        /// </summary>
        /// <param name="webClient">Web client used for the request (the caller sets the API key header on it).</param>
        /// <param name="jobId">ID of the job to check.</param>
        /// <returns>The value of the "status" property of the JSON response, converted to a string.</returns>
        static string CheckJobStatus(WebClient webClient, string jobId)
        {
            // Escape the ID so it is always transmitted as a single query parameter value.
            string url = "https://api.pdf.co/v1/job/check?jobid=" + Uri.EscapeDataString(jobId);

            string response = webClient.DownloadString(url);
            JObject json = JObject.Parse(response);

            return Convert.ToString(json["status"]);
        }
    }
}

packages.config

      
<?xml version="1.0" encoding="utf-8"?>
<packages>
  <package id="Newtonsoft.Json" version="10.0.3" targetFramework="net40" />
</packages>

VIDEO

ON-PREMISE OFFLINE SDK

Get 60 Day Free Trial

See also:

ON-DEMAND REST WEB API

Get Your API Key

See also:

Related Samples: