Hi, I have been using PDFNet to search for text and highlight the matches as needed.
The text that I search for may contain newline or carriage return characters.
In some of the PDF documents, the text is not highlighted properly when it spans a line break.
Here is the code:
/// <summary>
/// Downloads the PDF at <paramref name="pdfLocation"/>, runs a PDFNet regular-expression
/// text search for every match of the single SnippetData entry in
/// <paramref name="snptColl"/>, and records one bounding rectangle per highlight quad.
/// The pattern is the match's HighLight text with TextLeft / TextRight used as
/// look-behind / look-ahead context.
/// </summary>
/// <param name="pdfLocation">Address handed to <c>WebClient.DownloadData</c>.</param>
/// <param name="snptColl">
/// Snippet collection; <c>SnippetData.Single()</c> is called on it, so it must hold exactly one entry.
/// </param>
/// <returns>
/// Presumably the accumulated coordinate list (lstOrdinates) - the return statement is
/// not part of this excerpt; TODO confirm.
/// </returns>
/// <exception cref="ApplicationException">
/// Thrown (wrapping the original error) when a PDFNetException is caught inside the try block.
/// </exception>
internal static List<nENTITIES.Coordinate> GetPDFCoordinate(string pdfLocation, nENTITIES.SnippetCollection snptColl)
{
List<nENTITIES.Coordinate> lstOrdinates = new List<nENTITIES.Coordinate>();
// NOTE(review): downloadClient is never disposed in the visible code.
WebClient downloadClient = new WebClient();
byte[] byteContent = downloadClient.DownloadData(pdfLocation);
PDFDoc doc = new PDFDoc(byteContent, byteContent.Length);
// NOTE(review): this call sits before the try/using below, so an exception thrown here
// bypasses both catch handlers and the disposal of doc.
doc.InitSecurityHandler();
try
{
using (doc)
{
// Output slots filled by txt_search.Run(); none of the three is read afterwards in this excerpt.
Int32 page_num = 0;
String result_str = "", ambient_string = "";
Highlights hlts = new Highlights();
TextSearch txt_search = new TextSearch();
// Search mode: the e_reg_expression, e_page_stop, e_highlight and e_whole_word flags OR-ed together.
Int32 mode = (Int32)(TextSearch.SearchMode.e_reg_expression | TextSearch.SearchMode.e_page_stop | TextSearch.SearchMode.e_highlight | TextSearch.SearchMode.e_whole_word);
foreach (var snippet in snptColl.SnippetData.Single().Match)
{
// NOTE(review): the guard tests snippet.Text, but the body dereferences TextLeft and TextRight;
// a null value there would throw and end up in the generic catch below.
if (!string.IsNullOrEmpty(snippet.Text))
{
// Escape regex metacharacters in the left context. The snippet object itself is overwritten,
// so the escaped text is also what ends up in Coordinate.Snippet further down.
// "?" is removed rather than escaped.
// NOTE(review): "]" and "." are not handled by this list.
snippet.TextLeft = snippet.TextLeft.Replace(@"\", @"\\");
snippet.TextLeft = snippet.TextLeft.Replace("?", "");
snippet.TextLeft = snippet.TextLeft.Replace("(", "\\(");
snippet.TextLeft = snippet.TextLeft.Replace(")", "\\)");
snippet.TextLeft = snippet.TextLeft.Replace("+", "\\+");
snippet.TextLeft = snippet.TextLeft.Replace("*", "\\*");
snippet.TextLeft = snippet.TextLeft.Replace("^", "\\^");
snippet.TextLeft = snippet.TextLeft.Replace("$", "\\$");
snippet.TextLeft = snippet.TextLeft.Replace("|", "\\|");
snippet.TextLeft = snippet.TextLeft.Replace("[", "\\[");
snippet.TextLeft = snippet.TextLeft.Replace("{", "\\{");
snippet.TextLeft = snippet.TextLeft.Replace("}", "\\}");
// Same escaping, applied to the right context.
snippet.TextRight = snippet.TextRight.Replace(@"\", @"\\");
snippet.TextRight = snippet.TextRight.Replace("?", "");
snippet.TextRight = snippet.TextRight.Replace("(", "\\(");
snippet.TextRight = snippet.TextRight.Replace(")", "\\)");
snippet.TextRight = snippet.TextRight.Replace("+", "\\+");
snippet.TextRight = snippet.TextRight.Replace("*", "\\*");
snippet.TextRight = snippet.TextRight.Replace("^", "\\^");
snippet.TextRight = snippet.TextRight.Replace("$", "\\$");
snippet.TextRight = snippet.TextRight.Replace("|", "\\|");
snippet.TextRight = snippet.TextRight.Replace("[", "\\[");
snippet.TextRight = snippet.TextRight.Replace("{", "\\{");
snippet.TextRight = snippet.TextRight.Replace("}", "\\}");
// NOTE(review): the keyword goes into the pattern without any escaping.
string keyword = snippet.HighLight;
string pattern = string.Empty;
// flag records which context is present: 1 = left only, 2 = right only, 0 = both.
int flag = 0;
//pattern = "(?<=" + snippet.Text + ")" + keyword;
if (string.IsNullOrEmpty(snippet.TextRight))
{
// No right context: keyword preceded by the left context (look-behind).
flag = 1;
pattern = "(?<=" + snippet.TextLeft + ")" + keyword;
}
else if (string.IsNullOrEmpty(snippet.TextLeft))
{
// No left context: keyword followed by the right context (look-ahead).
flag = 2;
pattern = keyword + "(?=" + snippet.TextRight + ")";
}
else
{
// Both contexts present: look-behind + keyword + look-ahead.
pattern = "(?<=" + snippet.TextLeft + ")" + keyword + "(?=" + snippet.TextRight + ")";
}
////call Begin() method to initialize the text search.
// Presumably -1, -1 selects the whole page range - confirm against the PDFNet TextSearch docs.
txt_search.Begin(doc, pattern, mode, -1, -1);
bool done = false;
// Keep calling Run() until it reports e_done.
while (!done)
{
TextSearch.ResultCode code = txt_search.Run(ref page_num, ref result_str, ref ambient_string, hlts);
switch (code)
{
case TextSearch.ResultCode.e_found:
// NOTE(review): the same hlts instance is reused for every Run() call and every snippet;
// whether Run() replaces or appends to its content is not visible here - verify.
hlts.Begin(doc);
while (hlts.HasNext())
{
// NOTE(review): cur_page is assigned but never used.
Page cur_page = doc.GetPage(hlts.GetCurrentPageNumber());
double[] quads = hlts.GetCurrentQuads();
// The array is consumed in groups of eight doubles: even offsets are read as x, odd offsets as y.
int quad_count = quads.Length / 8;
for (int i = 0; i < quad_count; ++i)
{
//assume each quad is an axis-aligned rectangle
// Reduce the four corner points to a min/max bounding box.
int offset = 8 * i;
double x1 = Math.Min(Math.Min(Math.Min(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
double x2 = Math.Max(Math.Max(Math.Max(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
double y1 = Math.Min(Math.Min(Math.Min(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);
double y2 = Math.Max(Math.Max(Math.Max(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);
// The three branches differ only in how the Snippet display string is assembled.
if (flag == 1)
{
lstOrdinates.Add(new nENTITIES.Coordinate { PageNumber = hlts.GetCurrentPageNumber(), X1 = x1, X2 = x2, Y1 = y1, Y2 = y2, Snippet = snippet.TextLeft + "<b>" + keyword.ToLower() + "</b>" });
}
else if (flag == 2)
{
// NOTE(review): TextRight is placed before the keyword here, although the pattern
// for flag 2 expects it after the keyword - confirm the intended order.
lstOrdinates.Add(new nENTITIES.Coordinate { PageNumber = hlts.GetCurrentPageNumber(), X1 = x1, X2 = x2, Y1 = y1, Y2 = y2, Snippet = snippet.TextRight + "<b>" + keyword.ToLower() + "</b>" });
}
else
{
lstOrdinates.Add(new nENTITIES.Coordinate { PageNumber = hlts.GetCurrentPageNumber(), X1 = x1, X2 = x2, Y1 = y1, Y2 = y2, Snippet = snippet.TextLeft + "<b>" + keyword.ToLower() + "</b>" + snippet.TextRight });
}
}
hlts.Next();
}
break;
case TextSearch.ResultCode.e_done:
// Search finished for this pattern; leave the while loop.
done = true;
break;
case TextSearch.ResultCode.e_page:
// No action; the loop simply calls Run() again.
break;
default:
break;
}
}
}
}
}
}
catch (PDFNetException ex)
{
// PDFNet failures are logged and rethrown wrapped in an ApplicationException.
_mhcLogger.Fatal(ex.Message, ex, _currentMethodName);
throw new ApplicationException(nCOMMON.ErrorCode.c_UNEXPECTED_SYSTEM_ERROR.ToString(), ex);
}
catch (Exception ex)
{
// NOTE(review): every other exception is logged and then swallowed, so execution
// continues after this handler with whatever was collected so far.
_mhcLogger.Fatal(ex.Message, ex, _currentMethodName);
}
// NOTE(review): the excerpt ends here - the return statement and the method's closing brace are not shown.
Now when I have the search text as (?<=wood. H. Trim Members For Replacement Windows: 1. Trim members for )vinyl clad window where in the PDF there is a line break after "wood. " and after “Windows:”