How to parse a simple document with the Document Parser API in PHP using PDF.co Web API

PDF.co Web API is a REST API that provides a set of data extraction functions and tools for document manipulation, including splitting and merging of PDF files. It includes built-in OCR and image recognition, and can generate and read barcodes from images, scans and PDFs.

On-demand (REST Web API) version:
 Web API (on-demand version)

On-premise offline SDK for Windows:
 60 Day Free Trial (on-premise)

AmazonAWS.yml

      
# Document Parser template for Amazon Web Services invoices.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy; keys and values are unchanged, but confirm the layout against the
# PDF.co Document Parser template reference before relying on it.
templateName: Amazon Web Services Invoice
templateVersion: 4
templatePriority: 0

# Keywords used by the template's detection rules.
detectionRules:
  keywords:
  - Amazon Web Services
  - ATTN
  - Invoice

objects:
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: TOTAL AMOUNT DUE ON{{Anything}}{{Dollar}}({{Number}})
    regex: true
    dataType: decimal
- name: subTotal
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: '{{LineStart}}{{Spaces}}Charges{{Spaces}}{{Dollar}}({{Number}})'
    regex: true
    dataType: decimal
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: Invoice Date:{{Spaces}}({{Anything}}){{LineEnd}}
    regex: true
    dataType: date
    dateFormat: MMMM d , yyyy
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: Invoice Number:{{Spaces}}({{Digits}})
    regex: true
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: Amazon Web Services, Inc.
    regex: true
- name: companyWebsite
  objectType: field
  fieldProperties:
    fieldType: static
    expression: aws.amazon.com
    regex: true
- name: billTo
  objectType: field
  fieldProperties:
    fieldType: rectangle
    expression: Bill to Address:{{ToggleSingleLineMode}}({{AnythingGreedy}})
    regex: true
    # NOTE(review): presumably page coordinates of the address area; the
    # meaning/order of the four numbers is not visible here - confirm.
    rectangle:
    - 33
    - 115.5
    - 213.75
    - 72.75
    pageIndex: 0
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: '{{LineStart}}{{Spaces}}Detail{{LineEnd}}'
      regex: true
    end:
      expression: '{{EndOfPage}}'
      regex: true
    # Named groups (description, unitPrice) define the row's columns.
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}}){{LineEnd}}'
      regex: true
    columns:
    - name: unitPrice
      dataType: decimal

DigitalOcean.yml

      
# Document Parser template for DigitalOcean invoices.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy; keys and values are unchanged, but confirm the layout against the
# PDF.co Document Parser template reference before relying on it.
templateName: DigitalOcean Invoice
templateVersion: 4
templatePriority: 0

# Keywords used by the template's detection rules.
detectionRules:
  keywords:
  - DigitalOcean
  - 101 Avenue of the Americas
  - Invoice Number

objects:
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: DigitalOcean
    regex: true
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    # Quoted because the value contains ": ", which YAML would otherwise
    # read as a nested mapping.
    expression: 'Invoice Number: ({{Digits}})'
    regex: true
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Date Issued: ({{SmartDate}})'
    regex: true
    dataType: date
    dateFormat: auto-mdy
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Total: {{Dollar}}({{Number}})'
    regex: true
    dataType: decimal
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Hours
      regex: true
    end:
      expression: 'Total:'
      regex: true
    # Named groups (description, hours, start, end, unitPrice) define the
    # row's columns.
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
      regex: true
    columns:
    - name: hours
      dataType: integer
    - name: unitPrice
      dataType: decimal

Google.yml

      
# Document Parser template for Google invoices.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy; keys and values are unchanged, but confirm the layout against the
# PDF.co Document Parser template reference before relying on it.
templateName: Google Invoice
templateVersion: 4
templatePriority: 0

# Keywords used by the template's detection rules.
detectionRules:
  keywords:
  - Google
  - 77-0493581
  - Invoice

# Most fields here omit fieldType.
# NOTE(review): presumably the parser applies a default field type - confirm.
objects:
- name: invoiceId
  objectType: field
  fieldProperties:
    expression: Invoice number:{{Spaces}}({{Digits}})
    regex: true
- name: dateIssued
  objectType: field
  fieldProperties:
    expression: Issue date:{{Spaces}}({{SmartDate}})
    regex: true
    dataType: date
    dateFormat: MMM d, yyyy
- name: total
  objectType: field
  fieldProperties:
    # NOTE(review): no capture group around {{Number}}, unlike invoiceId
    # above - verify the extracted value is the amount only.
    expression: Amount due in USD:{{Spaces}}{{Number}}
    regex: true
    dataType: decimal
- name: subTotal
  objectType: field
  fieldProperties:
    expression: Subtotal in USD:{{Spaces}}{{Number}}
    regex: true
    dataType: decimal
- name: taxRate
  objectType: field
  fieldProperties:
    expression: State sales tax {{OpeningParenthesis}}{{Digits}}{{Percent}}{{ClosingParenthesis}}
    regex: true
    dataType: integer
- name: tax
  objectType: field
  fieldProperties:
    expression: State sales tax{{Anything}}{{Number}}{{LineEnd}}
    regex: true
    dataType: decimal
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: Google LLC
    regex: true
- name: billTo
  objectType: field
  fieldProperties:
    fieldType: rectangle
    regex: true
    # NOTE(review): presumably page coordinates of the address area; the
    # meaning/order of the four numbers is not visible here - confirm.
    rectangle:
    - 0
    - 152
    - 280
    - 72
    pageIndex: 0
- name: billingId
  objectType: field
  fieldProperties:
    expression: Billing ID:{{Spaces}}({{DigitsOrSymbols}})
    regex: true
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Interval{{Spaces}}Quantity{{Spaces}}Amount
      regex: true
    end:
      expression: Subtotal in USD
      regex: true
    # Named groups (description, interval, quantity, amount) define the
    # row's columns.
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<interval>{{3Letters}}{{Space}}{{Digits}}{{Space}}{{Minus}}{{Space}}{{3Letters}}{{Space}}{{Digits}}){{Spaces}}(?<quantity>{{Digits}}){{Spaces}}(?<amount>{{Number}})'
      regex: true
    columns:
    - name: quantity
      dataType: integer
    - name: amount
      dataType: decimal

program.php

      
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Document Parse Results</title>
</head>
<body>

<?php

// Handles the submitted form: uploads the PDF to PDF.co, then runs the
// Document Parser on it with the uploaded template and prints the outcome.
// Expected form fields: "apiKey" (POST), "fileInput" and "fileTemplate" (files).
RunDocumentParserDemo();

/**
 * Escapes a value for safe output inside HTML text or a quoted attribute.
 *
 * @param mixed $text Value to print; it is cast to string first.
 * @return string HTML-escaped text.
 */
function EscapeHtml($text)
{
    // ENT_QUOTES is required because the result link uses a single-quoted attribute.
    return htmlspecialchars((string) $text, ENT_QUOTES, "UTF-8");
}

/**
 * Decodes a JSON response body into an associative array.
 *
 * @param mixed $body Raw response body as returned by curl_exec().
 * @return array|null Decoded array, or null when the body is not a JSON object/array.
 */
function DecodeJsonResponse($body)
{
    $json = json_decode((string) $body, true);

    return is_array($json) ? $json : null;
}

/**
 * Prints an HTTP-level failure (non-200 status) together with the response body.
 *
 * @param mixed $statusCode   HTTP status code of the failed request.
 * @param mixed $responseBody Response body of that same request.
 */
function PrintRequestError($statusCode, $responseBody)
{
    echo "<p>Status code: " . EscapeHtml($statusCode) . "</p>";
    echo "<p>" . EscapeHtml($responseBody) . "</p>";
}

/**
 * Prints an error reported inside a JSON response (or a fallback text when
 * the response could not be decoded or carries no message).
 *
 * @param array|null $json Decoded response, or null if decoding failed.
 */
function PrintServiceError($json)
{
    $message = (is_array($json) && isset($json["message"]))
        ? $json["message"]
        : "Unexpected response from the service.";

    echo "<p>Error: " . EscapeHtml($message) . "</p>";
}

/**
 * Main flow: validates the form input, requests a presigned upload URL,
 * uploads the PDF, reads the template and starts the parsing job.
 * Every failure is printed and ends the flow early.
 */
function RunDocumentParserDemo()
{
    // Get submitted form data.
    // The authentication key (API Key). Get your own by registering at https://app.pdf.co/documentation/api
    $apiKey = (isset($_POST["apiKey"]) && is_string($_POST["apiKey"])) ? trim($_POST["apiKey"]) : "";

    // The key is placed into an HTTP header, so line breaks must be rejected.
    if ($apiKey === "" || preg_match('/[\r\n]/', $apiKey)) {
        echo "<p>Error: API key is missing or invalid.</p>";
        return;
    }

    foreach (array("fileInput", "fileTemplate") as $field) {
        if (!isset($_FILES[$field]["tmp_name"], $_FILES[$field]["error"])
            || $_FILES[$field]["error"] !== UPLOAD_ERR_OK
            || !is_uploaded_file($_FILES[$field]["tmp_name"])
        ) {
            echo "<p>Error: file \"" . EscapeHtml($field) . "\" was not uploaded.</p>";
            return;
        }
    }

    // 1. RETRIEVE THE PRESIGNED URL TO UPLOAD THE FILE.
    // * If you already have the direct PDF file link, go to the step 3.

    // Use the client-side file name (not the server temp path) and URL-encode it.
    $fileName = isset($_FILES["fileInput"]["name"]) ? basename((string) $_FILES["fileInput"]["name"]) : "";
    if ($fileName === "") {
        $fileName = "document.pdf";
    }

    // Create URL
    $url = "https://api.pdf.co/v1/file/upload/get-presigned-url"
        . "?name=" . urlencode($fileName)
        . "&contenttype=application/octet-stream";

    // Create request
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_HTTPHEADER, array("x-api-key: " . $apiKey));
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    // Execute request and collect everything needed before releasing the handle
    $result = curl_exec($curl);
    $curlErrorNo = curl_errno($curl);
    $curlError = curl_error($curl);
    $status_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl);

    if ($curlErrorNo != 0) {
        // Display CURL error
        echo "Error: " . EscapeHtml($curlError);
        return;
    }

    if ($status_code != 200) {
        // Display service reported error
        PrintRequestError($status_code, $result);
        return;
    }

    $json = DecodeJsonResponse($result);
    if ($json === null || !empty($json["error"]) || !isset($json["presignedUrl"], $json["url"])) {
        PrintServiceError($json);
        return;
    }

    // Get URL to use for the file upload
    $uploadFileUrl = $json["presignedUrl"];
    // Get URL of uploaded file to use with later API calls
    $uploadedFileUrl = $json["url"];

    // 2. UPLOAD THE FILE TO CLOUD.

    $localFile = $_FILES["fileInput"]["tmp_name"];
    // Binary mode: the PDF must be sent byte-for-byte.
    $fileHandle = fopen($localFile, "rb");
    if ($fileHandle === false) {
        echo "<p>Error: cannot read the uploaded file.</p>";
        return;
    }

    // A fresh handle is used so no option of the previous request leaks into the upload.
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $uploadFileUrl);
    curl_setopt($curl, CURLOPT_HTTPHEADER, array("content-type: application/octet-stream"));
    curl_setopt($curl, CURLOPT_PUT, true);
    curl_setopt($curl, CURLOPT_INFILE, $fileHandle);
    curl_setopt($curl, CURLOPT_INFILESIZE, filesize($localFile));
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    // Execute request; keep the upload's own response for error reporting
    $uploadResult = curl_exec($curl);
    $curlErrorNo = curl_errno($curl);
    $curlError = curl_error($curl);
    $status_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl);
    fclose($fileHandle);

    if ($curlErrorNo != 0) {
        // Display CURL error
        echo "Error: " . EscapeHtml($curlError);
        return;
    }

    if ($status_code != 200) {
        // Display request error
        PrintRequestError($status_code, $uploadResult);
        return;
    }

    // Read all template texts
    $templateText = file_get_contents($_FILES["fileTemplate"]["tmp_name"]);
    if ($templateText === false) {
        echo "<p>Error: cannot read the uploaded template.</p>";
        return;
    }

    // 3. PARSE UPLOADED PDF DOCUMENT
    ParseDocument($apiKey, $uploadedFileUrl, $templateText);
}

/**
 * Starts an asynchronous Document Parser job and polls it until it leaves
 * the "working" state, printing each status and, on success, a link to the
 * resulting JSON file.
 *
 * @param string $apiKey          PDF.co API key sent in the "x-api-key" header.
 * @param string $uploadedFileUrl URL of the PDF to parse.
 * @param string $templateText    Document Parser template content.
 */
function ParseDocument($apiKey, $uploadedFileUrl, $templateText)
{
    // (!) Make asynchronous job
    $async = TRUE;

    // Prepare URL for Document parser API call.
    // See documentation: https://apidocs.pdf.co/?#1-pdfdocumentparser
    $url = "https://api.pdf.co/v1/pdf/documentparser";

    // Prepare requests params
    $parameters = array();
    $parameters["url"] = $uploadedFileUrl;
    $parameters["template"] = $templateText;
    $parameters["async"] = $async;

    // Create Json payload
    $data = json_encode($parameters);

    // Create request
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_HTTPHEADER, array("x-api-key: " . $apiKey, "Content-type: application/json"));
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $data);

    // Execute request and release the handle (the original never closed it)
    $result = curl_exec($curl);
    $curlErrorNo = curl_errno($curl);
    $curlError = curl_error($curl);
    $status_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl);

    // Show the raw service response (for demo purposes)
    echo EscapeHtml($result) . "<br/>";

    if ($curlErrorNo != 0) {
        // Display CURL error
        echo "Error: " . EscapeHtml($curlError);
        return;
    }

    if ($status_code != 200) {
        // Display request error
        PrintRequestError($status_code, $result);
        return;
    }

    $json = DecodeJsonResponse($result);
    if ($json === null || !empty($json["error"]) || !isset($json["url"], $json["jobId"])) {
        // Display service reported error
        PrintServiceError($json);
        return;
    }

    // URL of generated JSON file that will available after the job completion
    $resultFileUrl = $json["url"];
    // Asynchronous job ID
    $jobId = $json["jobId"];

    // Check the job status in a loop
    do {
        $status = CheckJobStatus($jobId, $apiKey); // Possible statuses: "working", "failed", "aborted", "success".

        // Display timestamp and status (for demo purposes)
        echo "<p>" . date(DATE_RFC2822) . ": " . EscapeHtml($status) . "</p>";

        if ($status == "success") {
            // Display link to JSON file with information about parsed fields
            echo "<div><h2>Parsing Result:</h2><a href='" . EscapeHtml($resultFileUrl) . "' target='_blank'>"
                . EscapeHtml($resultFileUrl) . "</a></div>";
            break;
        } else if ($status == "working") {
            // Pause for a few seconds
            sleep(3);
        } else {
            // Any other status (including null after a failed check) ends the polling
            echo EscapeHtml($status) . "<br/>";
            break;
        }
    } while (true);
}

/**
 * Queries the status of an asynchronous job.
 *
 * @param string $jobId  Job identifier returned when the job was created.
 * @param string $apiKey PDF.co API key sent in the "x-api-key" header.
 * @return string|null Status reported by the service, or null when the check
 *                     failed (the failure is printed).
 */
function CheckJobStatus($jobId, $apiKey)
{
    $status = null;

    // Create URL
    $url = "https://api.pdf.co/v1/job/check";

    // Prepare requests params
    $parameters = array();
    $parameters["jobid"] = $jobId;

    // Create Json payload
    $data = json_encode($parameters);

    // Create request
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_HTTPHEADER, array("x-api-key: " . $apiKey, "Content-type: application/json"));
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $data);

    // Execute request
    $result = curl_exec($curl);

    if (curl_errno($curl) == 0) {
        $status_code = curl_getinfo($curl, CURLINFO_HTTP_CODE);

        if ($status_code == 200) {
            $json = DecodeJsonResponse($result);

            if ($json !== null && empty($json["error"]) && isset($json["status"])) {
                $status = $json["status"];
            } else {
                // Display service reported error
                PrintServiceError($json);
            }
        } else {
            // Display request error
            PrintRequestError($status_code, $result);
        }
    } else {
        // Display CURL error
        echo "Error: " . EscapeHtml(curl_error($curl));
    }

    // Cleanup
    curl_close($curl);

    return $status;
}

?>

</body>
</html>

VIDEO

ON-PREMISE OFFLINE SDK

Get 60 Day Free Trial

See also:

ON-DEMAND REST WEB API

Get Your API Key

See also:

Related Samples: