My PDF input file has text that extends past the page crop box. Text search and extraction are not finding or extracting that text. How do I find text such as this?
Text search and extraction are limited to the content inside the crop box; content outside its boundaries will not be found. What you can do is temporarily expand the media and crop boxes for a page and then perform the search/extraction. The sample here expands the media and crop boxes by an applied margin. You will need to judge the margin for your use case, since some badly formatted PDFs might have content well outside the crop box. (The same method works for text extraction.)
// Sample driver: opens the input PDF, enlarges every page's media/crop box via
// ExpandPageBoxes, then runs a text search and prints every match it reports.
// text_search_pattern and mode are expected to be defined by the surrounding code.
try
{
    using (PDFDoc pdf_doc = new PDFDoc("[INPUT_FILE]"))
    {
        pdf_doc.InitSecurityHandler();

        // Expanding all pages but you could just do the one you need.
        ExpandPageBoxes(pdf_doc);

        // TextSearch - the last two arguments to Begin (3, 3) are the start and
        // end page, so the search goes straight to page 3 and stays there.
        Int32 page_num = 0;
        String result_str = "", ambient_string = "";
        Highlights hlts = new Highlights();
        TextSearch txt_search = new TextSearch();
        txt_search.Begin(pdf_doc, text_search_pattern, mode, 3, 3);

        while (true)
        {
            TextSearch.ResultCode code = txt_search.Run(ref page_num, ref result_str, ref ambient_string, hlts);

            // Any result other than "found" ends the loop.
            if (code != TextSearch.ResultCode.e_found)
            {
                break;
            }

            Console.WriteLine($"Found: {result_str} on Page {page_num}");
        }
    }
}
catch (Exception e)
{
    // Report the failure message; the sample does not rethrow.
    Console.WriteLine(e.Message);
}
// Enlarges the media box and crop box of every page in the document.
// For each page the new box is the union of the current crop box and the page's
// visible content box, grown by a fixed margin on all four sides.
//   PDF: the document whose pages are modified in place.
static void ExpandPageBoxes(PDFDoc PDF)
{
    // Padding added on every side of the combined box.
    // You should adjust this to your need.
    const double margin = 50.0;

    // Boxes as they were before modification, keyed by page index.
    // NOTE(review): this map is filled but never read or returned here; it would
    // have to be exposed to the caller if the boxes are to be restored afterwards.
    var original_rects = new Dictionary<int, (Rect media, Rect crop)>();

    PageIterator pages = PDF.GetPageIterator();
    while (pages.HasNext())
    {
        Page page = pages.Current();

        Rect mediaBefore = page.GetMediaBox();
        Rect cropBefore = page.GetCropBox();
        original_rects[page.GetIndex()] = (mediaBefore, cropBefore);

        // Combine the crop box with the visible content box.
        // The content box may be empty if the page has no visible content.
        Rect visible = page.GetVisibleContentBox();
        Rect combined = UnionNonEmptyBoxes(cropBefore, visible);

        // Push each edge outwards by the margin.
        double left = combined.x1 - margin;
        double bottom = combined.y1 - margin;
        double right = combined.x2 + margin;
        double top = combined.y2 + margin;
        Rect grown = new Rect(left, bottom, right, top);

        // Apply to both Media and Crop so search/view consider the area "on page".
        page.SetBox(Page.Box.e_media, grown);
        page.SetBox(Page.Box.e_crop, grown);

        pages.Next();
    }
}
// Returns the smallest rectangle covering both inputs, ignoring any input that
// is "empty" (width or height less than or equal to zero).
//   - Second empty (whether or not First is): First is returned as-is.
//   - Only First empty: Second is returned as-is.
//   - Neither empty: a new Rect spanning the min/max corners of both.
static Rect UnionNonEmptyBoxes(Rect First, Rect Second)
{
    bool firstIsEmpty = (First.Width() <= 0) || (First.Height() <= 0);
    bool secondIsEmpty = (Second.Width() <= 0) || (Second.Height() <= 0);

    // Covers both "only Second is empty" and "both are empty".
    if (secondIsEmpty)
    {
        return First;
    }

    // Second has area here, so it wins when First does not.
    if (firstIsEmpty)
    {
        return Second;
    }

    double left = Math.Min(First.x1, Second.x1);
    double bottom = Math.Min(First.y1, Second.y1);
    double right = Math.Max(First.x2, Second.x2);
    double top = Math.Max(First.y2, Second.y2);
    return new Rect(left, bottom, right, top);
}