How to search and highlight text using PDFNet?

Aaron_Gravesdale · August 8, 2007, 1:19am

Q:

We have a requirement to stream PDF documents which will open in the
client's browser with search results highlighted and indexed, positioned
at the location of a user-selected search hit.

Does PDFNet SDK provide an API for a web service to open a PDF document,
insert the highlight information, open-at-page information, etc., and
then stream this edited document to the client? Does it require
copying the file to disk in order to make these edits?
-----------
A:

You can use PDFNet SDK (www.pdftron.com/net) to implement this
functionality. The edited file doesn't need to be saved to a temporary
file (i.e. it can be saved to memory).

Attached to this message is sample code that illustrates how to
programmatically highlight text. The sample uses TextExtractor to
extract words and the PDFDraw class to rasterize pages with highlight
annotations. The sample also saves a modified PDF document that includes
the highlighted text.

To run the sample you could simply replace the TextExtract sample (or
comment out the old version) in the PDFNet/Samples folder. You would also
need to download the latest PDFNet demo preview:
www.pdftron.com/downloads/PDFNetPreviewDemo.zip. The sample is
implemented in C#, but the same functionality can be easily translated
to VB.NET or C++.

For an example of how to save the modified document to a memory buffer
instead of a file, please take a look at the PDFDocMemory sample
project (www.pdftron.com/net/samplecode.html#PDFDocMemory).

//---------------------------------------------------
// The following sample illustrates how to programmatically highlight
// text.
// The sample uses TextExtractor to extract words and the PDFDraw class
// to rasterize pages with highlight annotations. The sample also saves
// a modified PDF document that includes the highlighted text.
//
// If you are looking for interactive text selection and highlighting,
// the PDFView class already includes built-in tool modes for text
// search and highlighting.
// For a concrete example of how to use these functions, please take a
// look at the latest version of the PDFView sample project.
//---------------------------------------------------

using System;
using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;

namespace TextHighlightTestCS
{
class PDFTextHighligh
{
  /// <summary>
  /// Uses PDFNet to generate the appearance stream for a highlight annotation.
  /// </summary>
  /// <param name="doc">Document the appearance stream is written into.</param>
  /// <param name="bbox">Bounding box of the text to highlight; also used as the stream's BBox.</param>
  /// <param name="highlight_color">RGB fill color of the highlight.</param>
  /// <returns>The stream object, tagged with "BBox" and "Subtype" = "Form".</returns>
  static Obj CreateHighlightAppearance(PDFDoc doc, Rect bbox, ColorPt highlight_color)
  {
    ElementBuilder build = new ElementBuilder();
    ElementWriter writer = new ElementWriter();
    writer.Begin(doc);

    // Draw the highlight background, padded by 2 units on the left and right.
    // NOTE(review): ElementBuilder.CreateRect is documented as taking
    // (x, y, width, height), so width/height are derived from the corners
    // here instead of passing x2/y2 directly. The "BBox" entry set below
    // still limits what is painted to 'bbox'.
    double rect_width = (bbox.x2 - bbox.x1) + 4;
    double rect_height = bbox.y2 - bbox.y1;
    Element element = build.CreateRect(bbox.x1 - 2, bbox.y1, rect_width, rect_height);
    element.SetPathFill(true);
    element.SetPathStroke(false);

    GState gs = element.GetGState();
    gs.SetFillColorSpace(ColorSpace.CreateDeviceRGB());
    gs.SetFillColor(highlight_color);
    // Multiply blending, presumably so the text underneath stays readable.
    gs.SetBlendMode(GState.BlendMode.e_bl_multiply);
    writer.WriteElement(element);
    Obj stm = writer.End();

    // Set the bounding box and mark the stream as a form.
    stm.Put("BBox", Rect.CreateSDFRect(bbox));
    stm.Put("Subtype", Obj.CreateName("Form"));
    return stm;
  }

  /// <summary>
  /// Creates a highlight annotation covering <paramref name="bbox"/>.
  /// </summary>
  /// <param name="doc">Document that will own the annotation.</param>
  /// <param name="bbox">Rectangle to highlight.</param>
  /// <param name="highlight_color">Annotation and appearance color.</param>
  /// <returns>The new annotation; the caller adds it to a page.</returns>
  static Annot CreateHighlightAnnot(PDFDoc doc, Rect bbox, ColorPt highlight_color)
  {
    Annot a = Annot.Create(doc, Annot.Type.e_Highlight, bbox);
    a.SetColor(highlight_color);
    a.SetAppearance(CreateHighlightAppearance(doc, bbox, highlight_color));

    // QuadPoints: the four corners of the highlighted quad, written as
    // (x1,y2), (x2,y2), (x1,y1), (x2,y1).
    Obj quads = Obj.CreateArray();
    a.GetSDFObj().Put("QuadPoints", quads);
    double[] corners =
    {
      bbox.x1, bbox.y2,
      bbox.x2, bbox.y2,
      bbox.x1, bbox.y1,
      bbox.x2, bbox.y1
    };
    foreach (double coord in corners)
    {
      quads.PushBack(Obj.CreateNumber(coord));
    }
    return a;
  }

  /// <summary>
  /// Highlights every word that starts or ends with "Robin" (ignoring case)
  /// in newsletter.pdf, exports each page as a JPEG, and saves the annotated
  /// document as output.pdf.
  /// </summary>
  static void Main(string[] args)
  {
    PDFNet.Initialize();
    PDFNet.SetResourcesPath("../../../../../resources");

    // Relative path to the folder containing test files.
    const string input_path = "../../../../TestFiles/";
    const string output_path = "../../../../TestFiles/Output/";

    // Text to look for; matching is case-insensitive.
    const string search_term = "Robin";

    PDFDoc doc = null;
    try
    {
      doc = new PDFDoc(input_path + "newsletter.pdf");
      doc.InitSecurityHandler();

      ColorPt highlight_color = new ColorPt(1, 1, 0); // Yellow

      TextExtractor txt = new TextExtractor(); // Used to extract words
      Rect word_bbox = new Rect();

      PDFDraw pdfdraw = new PDFDraw(96); // Used to export PDF pages to bitmap.

      PageIterator itr = doc.PageBegin();
      PageIterator end = doc.PageEnd();
      for (; itr != end; itr.Next())
      {
        Page page = itr.Current();
        txt.Begin(page); // Read the page.

        // Extract words one by one; GetNextWord returns null when the page is exhausted.
        String word = txt.GetNextWord(word_bbox);
        for (; word != null; word = txt.GetNextWord(word_bbox))
        {
          // Ordinal, case-insensitive comparison: unlike ToUpper(), the
          // result does not depend on the current culture. Prefix/suffix
          // matching is kept so words with adjacent punctuation still match.
          bool is_hit = word.StartsWith(search_term, StringComparison.OrdinalIgnoreCase)
                     || word.EndsWith(search_term, StringComparison.OrdinalIgnoreCase);
          if (is_hit)
          {
            // Debug aid:
            // Console.WriteLine("{0} \t bbox: {1}, {2}, {3}, {4}\n", word, word_bbox.x1, word_bbox.y1, word_bbox.x2, word_bbox.y2);
            page.AnnotPushBack(CreateHighlightAnnot(doc, word_bbox, highlight_color));
          }
        }

        // Rasterize the page, including the highlights just added.
        string outname = string.Format("{0}out{1:d}.jpg", output_path, itr.GetPageNumber());
        Console.WriteLine(outname);
        pdfdraw.Export(page, outname, "jpg");
      }

      doc.Save(output_path + "output.pdf", Doc.SaveOptions.e_linearized);
      Console.WriteLine("Done.");
    }
    catch (PDFNetException e)
    {
      Console.WriteLine(e.Message);
    }
    finally
    {
      // Close the document even when processing fails part-way through.
      if (doc != null)
      {
        doc.Close();
      }
    }
  }
}
}

Aaron_Gravesdale · August 8, 2007, 4:26pm

Q:

If the highlighted document is rasterized, should I anticipate that
search functionality on the client is disabled, or will all Acrobat
Reader functions still work?

A:

In the sample code PDF pages are rasterized only for illustration
purposes, because some users may want to generate raster images with
highlighted text instead of PDF. In case you don’t need this
functionality you can simply comment out all lines related to
‘pdfdraw’. The sample also shows how you can save text highlights as
part of a PDF document. If your web service delivers PDF with
highlights generated using PDFNet, all Acrobat Reader functions will
work as expected.