Extract Bold Text from PDF in JavaScript using PDF.co Web API
In this tutorial, we'll show you how to extract bold text from a PDF using PDF.co Web API in Asynchronous mode with JavaScript. This approach is particularly useful for handling large PDFs as it avoids timeouts and allows other operations to proceed while waiting for the conversion process to complete.
Step 1: Source Code
Open your text editor and create a JavaScript file with the following code:
var https = require("https");
var path = require("path");
var fs = require("fs");
// Import axios for making HTTP requests
const axios = require("axios");
// PDF.co authentication key (API Key); the script sends it in the `x-api-key` request header.
// Get your own by registering at https://app.pdf.co
const API_KEY = "********************";
// Path of the local source PDF file that is uploaded and converted
const SourceFile = "./sample.pdf";
// Comma-separated list of page indices (or ranges) to process. Leave empty for all pages. Example: '0,2-5,7-'.
const Pages = "";
// PDF document password. Leave empty for unprotected documents.
const Password = "";
// Path where the downloaded XML result is saved; its base name is also sent to the API as the output name
const DestinationFile = "./result.xml";
// Run the whole workflow as a single flat promise chain; a failure in any
// stage ends up in the one handler at the bottom, which logs it.
// Stage 1: ask the service for a presigned upload URL.
getPresignedUrl(API_KEY, SourceFile)
  .then(([uploadUrl, uploadedFileUrl]) =>
    // Stage 2: send the local PDF to the cloud, then pass the uploaded
    // file's URL on to the next stage.
    uploadFile(API_KEY, SourceFile, uploadUrl).then(() => uploadedFileUrl)
  )
  .then((uploadedFileUrl) => {
    // Stage 3: request the PDF-to-XML conversion of the uploaded file.
    convertPdfToXml(API_KEY, uploadedFileUrl, Password, Pages, DestinationFile);
  })
  .catch((e) => {
    console.log(e);
  });
/**
 * Asks the PDF.co `Get Presigned URL` endpoint for a temporary upload location.
 *
 * @param {string} apiKey - API key sent in the `x-api-key` request header.
 * @param {string} localFile - Path of the file to be uploaded; only its base
 *   name is sent to the service.
 * @returns {Promise<string[]>} Resolves with `[presignedUrl, url]` taken from
 *   the service response. Rejects with the service's `message` when it reports
 *   an error, or with the underlying error when the request fails or the
 *   response body is not valid JSON.
 */
function getPresignedUrl(apiKey, localFile) {
  return new Promise((resolve, reject) => {
    // Encode the file name as a query component so characters such as `&` or
    // `#` in the name cannot break the query string.
    const fileName = encodeURIComponent(path.basename(localFile));
    const reqOptions = {
      host: "api.pdf.co",
      path: `/v1/file/upload/get-presigned-url?contenttype=application/octet-stream&name=${fileName}`,
      headers: { "x-api-key": apiKey }
    };

    // Every failure path logs with the same prefix and rejects the promise.
    const fail = (reason) => {
      console.log("getPresignedUrl(): " + reason);
      reject(reason);
    };

    // Send request
    https.get(reqOptions, (response) => {
      // The body can arrive in several chunks, so collect all of them and
      // parse only once the response has ended.
      let body = "";
      response.setEncoding("utf8");
      response.on("data", (chunk) => {
        body += chunk;
      });
      response.on("error", fail);
      response.on("end", () => {
        let data;
        try {
          data = JSON.parse(body);
        } catch (e) {
          fail(e);
          return;
        }
        if (data.error === false) {
          // Return presigned url we received
          resolve([data.presignedUrl, data.url]);
        } else {
          // Service reported error
          fail(data.message);
        }
      });
    })
    .on("error", fail);
  });
}
/**
 * Uploads a local file to a presigned URL with an HTTP PUT request.
 *
 * @param {string} apiKey - Not used by this function; kept so the signature
 *   matches the other helpers.
 * @param {string} localFile - Path of the file whose contents are uploaded.
 * @param {string} uploadUrl - Presigned URL that receives the PUT request.
 * @returns {Promise<void>} Resolves with no value once the PUT succeeds.
 *   Rejects with the read error if the file cannot be read, or with the
 *   axios error if the upload fails.
 */
function uploadFile(apiKey, localFile, uploadUrl) {
  return new Promise((resolve, reject) => {
    // Read the file the caller asked for rather than a module-level default.
    fs.readFile(localFile, (err, data) => {
      if (err) {
        console.log("Error reading file: ", err);
        reject(err);
        return;
      }
      // Use axios to upload the file
      axios.put(uploadUrl, data, {
        headers: {
          "Content-Type": "application/octet-stream"
        }
      })
        .then(() => {
          resolve();
        })
        .catch((uploadErr) => {
          console.log("uploadFile() error: " + uploadErr);
          reject(uploadErr);
        });
    });
  });
}
/**
 * Starts an asynchronous `PDF To XML` job for an uploaded file and, once the
 * service confirms the job, hands it to `checkIfJobIsCompleted` for polling.
 *
 * Failures (request error, unparseable response, service-reported error) are
 * logged with the `convertPdfToXml(): ` prefix; nothing is returned or thrown
 * to the caller for them.
 *
 * @param {string} apiKey - API key sent in the `x-api-key` request header.
 * @param {string} uploadedFileUrl - URL of the uploaded PDF, sent as `url`.
 * @param {string} password - PDF password, sent as `password`.
 * @param {string} pages - Page list/ranges, sent as `pages`.
 * @param {string} destinationFile - Local path for the result; its base name
 *   is sent as `name` and the full path is passed on to the job poller.
 */
function convertPdfToXml(apiKey, uploadedFileUrl, password, pages, destinationFile) {
  const report = (reason) => {
    console.log("convertPdfToXml(): " + reason);
  };

  // JSON payload for api request
  const jsonPayload = JSON.stringify({
    name: path.basename(destinationFile),
    password: password,
    pages: pages,
    url: uploadedFileUrl,
    async: true
  });

  // Prepare request to `PDF To XML` API endpoint
  const reqOptions = {
    host: "api.pdf.co",
    method: "POST",
    path: "/v1/pdf/convert/to/xml",
    headers: {
      "x-api-key": apiKey,
      "Content-Type": "application/json",
      "Content-Length": Buffer.byteLength(jsonPayload, "utf8")
    }
  };

  // Send request
  const postRequest = https.request(reqOptions, (response) => {
    // The body can arrive in several chunks, so collect all of them and
    // parse only once the response has ended.
    let body = "";
    response.setEncoding("utf8");
    response.on("data", (chunk) => {
      body += chunk;
    });
    response.on("error", report);
    response.on("end", () => {
      let data;
      try {
        data = JSON.parse(body);
      } catch (e) {
        report(e);
        return;
      }
      if (data.error === false) {
        // Process returned job
        console.log(`Job #${data.jobId} has been created!`);
        // The key is passed as an extra trailing argument so polling can use
        // the same key; a poller that takes only three arguments ignores it.
        checkIfJobIsCompleted(data.jobId, data.url, destinationFile, apiKey);
      } else {
        // Service reported error
        report(data.message);
      }
    });
  });
  postRequest.on("error", report);

  // Write request data
  postRequest.write(jsonPayload);
  postRequest.end();
}
/**
 * Polls the `/v1/job/check` endpoint for a background job and reacts to its
 * status:
 *  - "working": checks again after 3 seconds;
 *  - "success": downloads `resultFileUrl` into `destinationFile`;
 *  - anything else: logs the final status and stops.
 *
 * Request, response, parse and download failures are logged with the
 * `checkIfJobIsCompleted(): ` prefix instead of being left unhandled.
 *
 * @param {string} jobId - Identifier of the job to check, sent as `jobid`.
 * @param {string} resultFileUrl - URL the result is downloaded from on success.
 * @param {string} destinationFile - Local path the downloaded result is written to.
 * @param {string} [apiKey=API_KEY] - API key sent in the `x-api-key` header;
 *   defaults to the module-level `API_KEY`.
 */
function checkIfJobIsCompleted(jobId, resultFileUrl, destinationFile, apiKey = API_KEY) {
  const report = (reason) => {
    console.log("checkIfJobIsCompleted(): " + reason);
  };

  // JSON payload for api request
  const jsonPayload = JSON.stringify({
    jobid: jobId
  });
  const reqOptions = {
    host: "api.pdf.co",
    path: "/v1/job/check",
    method: "POST",
    headers: {
      "x-api-key": apiKey,
      "Content-Type": "application/json",
      "Content-Length": Buffer.byteLength(jsonPayload, "utf8")
    }
  };

  // Send request
  const postRequest = https.request(reqOptions, (response) => {
    // The body can arrive in several chunks, so collect all of them and
    // parse only once the response has ended.
    let body = "";
    response.setEncoding("utf8");
    response.on("data", (chunk) => {
      body += chunk;
    });
    response.on("error", report);
    response.on("end", () => {
      let data;
      try {
        data = JSON.parse(body);
      } catch (e) {
        report(e);
        return;
      }
      console.log(`Checking Job #${jobId}, Status: ${data.status}, Time: ${new Date().toLocaleString()}`);
      if (data.status === "working") {
        // Check again after 3 seconds
        setTimeout(() => {
          checkIfJobIsCompleted(jobId, resultFileUrl, destinationFile, apiKey);
        }, 3000);
      } else if (data.status === "success") {
        // Download XML file
        https.get(resultFileUrl, (download) => {
          if (download.statusCode < 200 || download.statusCode >= 300) {
            // Do not save an error page as the result; drain the response
            // so the socket is released.
            download.resume();
            report(`download failed with HTTP status ${download.statusCode}`);
            return;
          }
          // Create the output file only once there is a good response to
          // write, so a failed download does not leave an empty file behind.
          const file = fs.createWriteStream(destinationFile);
          file.on("error", report);
          // "finish" fires only after all data was written successfully.
          file.on("finish", () => {
            console.log(`Generated XML file saved as "${destinationFile}" file.`);
          });
          download.on("error", (e) => {
            report(e);
            file.destroy();
          });
          download.pipe(file);
        })
        .on("error", report);
      } else {
        console.log(`Operation ended with status: "${data.status}".`);
      }
    });
  });
  // Without this listener a network failure would raise an unhandled
  // 'error' event and terminate the process.
  postRequest.on("error", report);

  // Write request data
  postRequest.write(jsonPayload);
  postRequest.end();
}
Step 2: Install Axios Module
Next, install the axios module, which the script uses to upload the file. Run `npm install axios` in the terminal.
Step 3: PDF.co API Key
On line 12, add your API key inside the double quotes. You can get the PDF.co API Key from your dashboard.
data:image/s3,"s3://crabby-images/93559/93559aa472911a1482bc0f657fef29d9a10cd533" alt="API Key"
Step 4: Source and Destination Files
- On line 15, set the `SourceFile` variable to the path to your PDF.
- On line 21, set the `DestinationFile` variable to the desired path for the generated XML file.
data:image/s3,"s3://crabby-images/2f721/2f721826e7384c6283cdc740d54cdf2c1c1b7851" alt="Source and Destination Files"
Step 5: Run the Program
- Save your JavaScript file as `index.js`.
- Open a terminal, navigate to the project directory, and run:
node index.js
Step 6: How It Works
- Initiating an Asynchronous Conversion:
  - The code sends a request to the PDF.co API to start a PDF-to-XML conversion with the `async` option set to `true` to enable asynchronous processing.
  - The API response contains a job ID and the URL where the result file will be available once the job has finished.
- Checking Job Status:
  - The code uses the job ID to periodically query the job's status by sending requests to the API.
  - Once the status changes to "success", it indicates that the conversion process is complete, and the result file is ready for download.
- Downloading the Result:
  - After the conversion is successfully completed, the code retrieves the converted XML file from the provided URL and saves it to the specified location on the local system.
In this tutorial, you learned how to extract bold text from a PDF using PDF.co Web API and asynchronous mode. By implementing asynchronous processing, your application can efficiently handle large PDFs without running into timeouts. For more details on asynchronous processing with PDF.co, refer to the PDF.co documentation.
Related Tutorials
data:image/s3,"s3://crabby-images/708ab/708ab1fff1041b667446e8bda0ee2399b271ea6d" alt="Tutorial default thumbnail"
data:image/s3,"s3://crabby-images/708ab/708ab1fff1041b667446e8bda0ee2399b271ea6d" alt="Tutorial default thumbnail"
data:image/s3,"s3://crabby-images/708ab/708ab1fff1041b667446e8bda0ee2399b271ea6d" alt="Tutorial default thumbnail"
data:image/s3,"s3://crabby-images/708ab/708ab1fff1041b667446e8bda0ee2399b271ea6d" alt="Tutorial default thumbnail"