Extract Bold Text from PDF in JavaScript using PDF.co Web API

In this tutorial, we'll show you how to extract bold text from a PDF using PDF.co Web API in Asynchronous mode with JavaScript. This approach is particularly useful for handling large PDFs as it avoids timeouts and allows other operations to proceed while waiting for the conversion process to complete.

Step 1: Source Code

Open your text editor and create a JavaScript file with the following code:

var https = require("https");
var path = require("path");
var fs = require("fs");

// Import axios for making HTTP requests
const axios = require("axios");

// PDF.co API key sent with every request.
// Register at https://app.pdf.co to obtain your own.
const API_KEY = "********************";

// Path of the PDF that gets uploaded and converted.
const SourceFile = "./sample.pdf";
// Pages to process: comma-separated indices or ranges (example: '0,2-5,7-'). Empty means all pages.
const Pages = "";
// Password of the PDF document. Empty for unprotected documents.
const Password = "";
// Path the resulting XML file is written to.
const DestinationFile = "./result.xml";

// Run the three steps in sequence; a failure in any step is logged once.
(async () => {
    try {
        // 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
        const [uploadUrl, uploadedFileUrl] = await getPresignedUrl(API_KEY, SourceFile);

        // 2. UPLOAD THE FILE TO CLOUD.
        await uploadFile(API_KEY, SourceFile, uploadUrl);

        // 3. CONVERT UPLOADED PDF FILE TO XML
        convertPdfToXml(API_KEY, uploadedFileUrl, Password, Pages, DestinationFile);
    } catch (e) {
        console.log(e);
    }
})();

/**
 * Asks the PDF.co `Get Presigned URL` endpoint for a URL the file can be uploaded to.
 *
 * @param {string} apiKey - API key sent in the `x-api-key` header.
 * @param {string} localFile - Path of the file to upload; only its base name is sent to the service.
 * @returns {Promise<[string, string]>} Resolves with `[presignedUrl, url]` taken from the
 *   response when it reports `error === false`. Rejects with the response's `message`
 *   otherwise, or with the underlying error when the request fails or the body is not valid JSON.
 */
function getPresignedUrl(apiKey, localFile) {
    return new Promise((resolve, reject) => {
        // encodeURIComponent (not encodeURI) so that `&`, `#`, `+` and similar
        // characters in the file name cannot break the query string.
        const fileName = encodeURIComponent(path.basename(localFile));
        const reqOptions = {
            host: "api.pdf.co",
            path: `/v1/file/upload/get-presigned-url?contenttype=application/octet-stream&name=${fileName}`,
            headers: { "x-api-key": apiKey }
        };

        // Log and reject in one place so every failure path reports the same way.
        const fail = (reason) => {
            console.log("getPresignedUrl(): " + reason);
            reject(reason);
        };

        // Send request
        https.get(reqOptions, (response) => {
            // A response may arrive in several chunks, so collect the whole
            // body and parse it only once the stream has ended.
            let body = "";
            response.setEncoding("utf8");
            response.on("data", (chunk) => {
                body += chunk;
            });
            response.on("error", fail);
            response.on("end", () => {
                let data;
                try {
                    data = JSON.parse(body);
                } catch (e) {
                    fail(e);
                    return;
                }
                if (data.error === false) {
                    // Return presigned url we received
                    resolve([data.presignedUrl, data.url]);
                } else {
                    // Service reported error
                    fail(data.message);
                }
            });
        })
            .on("error", fail);
    });
}

/**
 * Reads a local file and uploads its bytes to a presigned URL with an HTTP PUT.
 *
 * @param {string} apiKey - Unused here (the presigned URL needs no key); kept so existing callers keep working.
 * @param {string} localFile - Path of the file to read and upload.
 * @param {string} uploadUrl - Presigned URL the file content is PUT to.
 * @returns {Promise<void>} Resolves with no value once the upload succeeds. Rejects with
 *   the read error or the upload error, after logging it.
 */
function uploadFile(apiKey, localFile, uploadUrl) {
    return fs.promises.readFile(localFile)
        .catch((err) => {
            console.log("Error reading file: ", err);
            throw err;
        })
        .then((data) =>
            // Use axios to upload the file
            axios.put(uploadUrl, data, {
                headers: {
                    "Content-Type": "application/octet-stream"
                }
            })
                .catch((err) => {
                    console.log("uploadFile() error: " + err);
                    throw err;
                })
        )
        // Callers only need to know the upload finished; drop the axios response.
        .then(() => undefined);
}

/**
 * Starts an asynchronous PDF-to-XML conversion job through the `PDF To XML` endpoint
 * and, once the job is created, hands it to `checkIfJobIsCompleted` for polling.
 *
 * Failures (request error, invalid JSON, service-reported error) are logged; nothing is thrown.
 *
 * @param {string} apiKey - API key sent in the `x-api-key` header.
 * @param {string} uploadedFileUrl - URL of the uploaded source PDF.
 * @param {string} password - PDF password; empty for unprotected documents.
 * @param {string} pages - Comma-separated page indices or ranges; empty for all pages.
 * @param {string} destinationFile - Local path for the XML result; its base name is sent as `name`.
 * @returns {void}
 */
function convertPdfToXml(apiKey, uploadedFileUrl, password, pages, destinationFile) {
    // JSON payload for api request
    const jsonPayload = JSON.stringify({
        name: path.basename(destinationFile),
        password: password,
        pages: pages,
        url: uploadedFileUrl,
        async: true
    });

    // Prepare request to `PDF To XML` API endpoint
    const reqOptions = {
        host: "api.pdf.co",
        method: "POST",
        path: `/v1/pdf/convert/to/xml`,
        headers: {
            "x-api-key": apiKey,
            "Content-Type": "application/json",
            "Content-Length": Buffer.byteLength(jsonPayload, 'utf8')
        }
    };

    // Send request
    const postRequest = https.request(reqOptions, (response) => {
        // The body can arrive in several chunks; decode as UTF-8 from the start
        // and parse only after the stream has ended.
        let body = "";
        response.setEncoding("utf8");
        response.on("data", (chunk) => {
            body += chunk;
        });
        response.on("end", () => {
            let data;
            try {
                data = JSON.parse(body);
            } catch (e) {
                console.log("convertPdfToXml(): " + e);
                return;
            }
            if (data.error === false) {
                // Process returned job
                console.log(`Job #${data.jobId} has been created!`);
                // The key is passed along as a fourth argument so status checks can use
                // the same key; a three-parameter checkIfJobIsCompleted ignores it.
                checkIfJobIsCompleted(data.jobId, data.url, destinationFile, apiKey);
            } else {
                // Service reported error
                console.log("convertPdfToXml(): " + data.message);
            }
        });
    });

    postRequest.on("error", (e) => {
        // Request error
        console.log("convertPdfToXml(): " + e);
    });

    // Write request data
    postRequest.write(jsonPayload);
    postRequest.end();
}

/**
 * Polls the `/v1/job/check` endpoint for a background job and acts on its status:
 * "working" schedules another check in 3 seconds, "success" downloads the result
 * file to `destinationFile`, and any other status is logged as the final outcome.
 *
 * Request, parse, download and file-write failures are logged; nothing is thrown.
 *
 * @param {string} jobId - Identifier of the job to check.
 * @param {string} resultFileUrl - URL the result file is downloaded from on success.
 * @param {string} destinationFile - Local path the downloaded XML is written to.
 * @param {string} [apiKey=API_KEY] - API key for the `x-api-key` header; defaults to the file-level `API_KEY`.
 * @returns {void}
 */
function checkIfJobIsCompleted(jobId, resultFileUrl, destinationFile, apiKey = API_KEY) {
    // JSON payload for api request
    const jsonPayload = JSON.stringify({
        jobid: jobId
    });

    const reqOptions = {
        host: "api.pdf.co",
        path: `/v1/job/check`,
        method: "POST",
        headers: {
            "x-api-key": apiKey,
            "Content-Type": "application/json",
            "Content-Length": Buffer.byteLength(jsonPayload, 'utf8')
        }
    };

    const logError = (e) => {
        console.log("checkIfJobIsCompleted(): " + e);
    };

    // Send request
    const postRequest = https.request(reqOptions, (response) => {
        // The body can arrive in several chunks; decode as UTF-8 from the start
        // and parse only after the stream has ended.
        let body = "";
        response.setEncoding("utf8");
        response.on("data", (chunk) => {
            body += chunk;
        });
        response.on("end", () => {
            let data;
            try {
                data = JSON.parse(body);
            } catch (e) {
                logError(e);
                return;
            }
            console.log(`Checking Job #${jobId}, Status: ${data.status}, Time: ${new Date().toLocaleString()}`);

            if (data.status === "working") {
                // Check again after 3 seconds
                setTimeout(() => {
                    checkIfJobIsCompleted(jobId, resultFileUrl, destinationFile, apiKey);
                }, 3000);
            } else if (data.status === "success") {
                // Download XML file
                const file = fs.createWriteStream(destinationFile);
                // Without a listener, a write failure (e.g. missing directory) would crash the process.
                file.on("error", logError);
                https.get(resultFileUrl, (download) => {
                    download.pipe(file)
                        .on("close", () => {
                            console.log(`Generated XML file saved as "${destinationFile}" file.`);
                        });
                })
                    .on("error", (e) => {
                        // Download failed: release the file handle before reporting.
                        file.close();
                        logError(e);
                    });
            } else {
                console.log(`Operation ended with status: "${data.status}".`);
            }
        });
    });

    // Without a listener, a network failure would surface as an uncaught 'error' event.
    postRequest.on("error", logError);

    // Write request data
    postRequest.write(jsonPayload);
    postRequest.end();
}

Step 2: Install Axios Module

Next, install the axios module for file upload. Type npm install axios in the Terminal.

Step 3: PDF.co API Key

Find the API_KEY constant near the top of the script and put your API key inside the double quotes. You can get the PDF.co API Key from your dashboard.

API Key

Step 4: Source and Destination Files

  • Set the SourceFile constant to the path of your PDF.
  • Set the DestinationFile constant to the desired path for the generated XML file.
Source and Destination Files

Step 5: Run the Program

  • Save your JavaScript file as index.js.
  • Open a terminal, navigate to the project directory, and run: node index.js

Step 6: How It Works

  1. Initiating an Asynchronous Conversion:
    • The code sends a request to the PDF.co API to start a PDF-to-XML conversion with the async option set to true to enable asynchronous processing.
    • The API response contains a job ID, which is used to monitor the conversion status, and the URL from which the result file can be downloaded once the job is done.
  2. Checking Job Status:
    • The code uses the job ID to periodically query the job's status by sending requests to the API.
    • Once the status changes to "success", it indicates that the conversion process is complete, and the result file is ready for download.
  3. Downloading the Result:
    • After the conversion is successfully completed, the code retrieves the converted XML file from the provided URL and saves it to the specified location on the local system.

In this tutorial, you learned how to extract bold text from a PDF using PDF.co Web API and asynchronous mode. By implementing asynchronous processing, your application can efficiently handle large PDFs without running into timeouts. For more details on asynchronous processing with PDF.co, refer to the PDF.co documentation.

Related Tutorials

See Related Tutorials