How to Skip Pages with No Images When Using OCR on PDFs

btompkinson1 · July 18, 2023, 4:25pm

When performing an OCR conversion of a PDF file, there are often pages that do not contain images or that otherwise do not need to be processed by the OCR module. It is possible to skip such pages using addIgnoreZonesForPage. Here is a Java example that checks each page of the input for an image and skips the page if no image is found:

    /**
     * Runs OCR on a PDF, skipping every page that contains no image, and saves the result.
     *
     * <p>For each page without an image, the page's whole media box is registered as an
     * ignore zone so the OCR module has nothing to process on that page.
     *
     * @param pdfFile    the PDF file to read
     * @param outputFile path the processed document is saved to (linearized)
     * @throws PDFNetException if the PDFNet SDK fails to open, process, or save the document
     * @throws IOException     if the input file cannot be opened or closed
     */
    private static void runOcrOnDocument(File pdfFile, String outputFile) throws PDFNetException, IOException {
        // try-with-resources releases the file handle on every exit path.
        try (FileInputStream inputStream = new FileInputStream(pdfFile)) {
            long startingTime = System.currentTimeMillis();
            // PDFDoc is closed as well so the document is released even when OCR or save throws.
            try (PDFDoc doc = new PDFDoc(inputStream)) {
                int pageCount = doc.getPageCount();
                OCROptions options = new OCROptions();

                ArrayList<Integer> pages = findPagesWithImages(doc);
                // Page numbers are 1-based, so the bound is inclusive; otherwise the last
                // page would never be considered for skipping.
                for (int i = 1; i <= pageCount; i++) {
                    if (!pages.contains(i)) {
                        Page pdfPage = doc.getPage(i);
                        Rect pdfPageRect = pdfPage.getMediaBox();
                        RectCollection pdfPageRectColl = new RectCollection();
                        pdfPageRectColl.addRect(pdfPageRect);
                        options.addIgnoreZonesForPage(pdfPageRectColl, i);
                    }
                }
                OCRModule.processPDF(doc, options);
                doc.save(outputFile, SDFDoc.SaveMode.LINEARIZED, null);
                System.out.println("    Process Time: " + (System.currentTimeMillis() - startingTime) / 1000. + " s");
            }
        }
    }

    /**
     * Collects the 1-based page numbers of every page that contains at least one image.
     *
     * @param pdfDoc the document to scan
     * @return page numbers, in ascending order, of pages for which
     *         {@code doesPageContainImages} returned {@code true}; empty if there are none
     * @throws PDFNetException if the PDFNet SDK fails while reading a page
     */
    public static ArrayList<Integer> findPagesWithImages(PDFDoc pdfDoc) throws PDFNetException {
        ElementReader reader = new ElementReader();
        ArrayList<Integer> pagesWithImages = new ArrayList<Integer>();
        int pageCount = pdfDoc.getPageCount();
        for (int pageId = 1; pageId <= pageCount; ++pageId) {
            reader.begin(pdfDoc.getPage(pageId));
            try {
                if (doesPageContainImages(reader)) {
                    pagesWithImages.add(pageId);
                }
            } finally {
                // Pair every begin() with end() so the reader finishes this page
                // before it is reused for the next one.
                reader.end();
            }
        }
        return pagesWithImages;
    }

    /**
     * Reports whether the remaining elements of the reader include an image.
     *
     * <p>Consumes elements from {@code reader} until an image or inline image is found
     * or the reader is exhausted.
     *
     * @param reader an element reader that has already been started on a page
     * @return {@code true} as soon as an image or inline-image element is encountered,
     *         {@code false} if the reader runs out of elements first
     * @throws PDFNetException if the PDFNet SDK fails while reading elements
     */
    public static boolean doesPageContainImages(ElementReader reader) throws PDFNetException{
        for (Element current = reader.next(); current != null; current = reader.next()) {
            int kind = current.getType();
            if (kind == Element.e_image || kind == Element.e_inline_image) {
                return true;
            }
        }
        return false;
    }