Q: I need to convert PDF to HTML while preserving the look of the original PDF
document. At the same time I need the ability to search and select text. I
saw some discussion in this group regarding this and I was able to
render the background image using 'pdftron.PDF.PDFDraw' and extract
text using pdftron.PDF.TextExtractor. Your engineer suggested two
things to do:
1. text conversion to html, see the code: (C#) http://www.pdftron.com/pdfnet/samplecode.html
look at the text extract code
2. then for graphics use PDFDraw to convert the page to an html image. then to
erase any text from the background use the element edit feature.
Still I am not sure how to bring these together in HTML. My
requirements are:
1. No we do not need any of the interactive features of PDF.
2. The text of the html need to be available as DOM elements and
international characters need to be html encoded.
4. For multiple page pdf documents all the pages need to be in the
same html file (basically so that once a single html file is loaded
all the pages are available one after another). But each page needs to
be marked with a div so that we can implement a scroll-to-page
JavaScript ourselves.
5. I would prefer C# implementation, however JAVA/C/C++ is also ok.
----------------------------
A: There are many ways you could implement PDF to HTML functionality using
PDFNet. The approach used in Google Docs is to render the entire page
as a bitmap (using PDFDraw) and a hidden text layer (using
TextExtractor). Unfortunately it seems that this approach would not
work for you due to requirement #2.
In this case you can generate a bitmap layer (with everything except
for text) and a text layer as shown in the following sample code
(http://groups.google.com/group/pdfnet-sdk/web/Pdf2Html.zip):
//
// Copyright (c) 2001-2010 by PDFTron Systems Inc. All Rights Reserved.
//
using System;
using System.IO;
using System.Collections;
using W = System.Web.UI;
using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;
namespace pdftron
{
/// <summary>
/// PdfToHtml implements a PDF to HTML converter using PDFNet.
/// </summary>
class PdfToHtml
{
/// <summary>
/// Converts every page of <paramref name="doc"/> to a single HTML document
/// using a zoom factor of 1 and an empty output path prefix.
/// </summary>
/// <param name="wri">Writer that receives the generated HTML.</param>
/// <param name="doc">Source PDF document.</param>
public void Convert(TextWriter wri, PDFDoc doc)
{
    Convert(wri, doc, -1, 1, "");
}

/// <summary>
/// Converts one page, or all pages, of <paramref name="doc"/> to a single
/// HTML document. Each page becomes an absolutely positioned div; pages are
/// stacked vertically by accumulating their scaled heights.
/// </summary>
/// <param name="wri">
/// Writer that receives the generated HTML. It is closed before this method
/// returns (behavior kept from the original sample).
/// </param>
/// <param name="doc">Source PDF document.</param>
/// <param name="page_number">
/// Page number passed to <c>doc.GetPage</c>, or -1 to convert every page.
/// </param>
/// <param name="zoom">Scale factor applied to page and text coordinates.</param>
/// <param name="outpath">
/// Prefix that is concatenated directly in front of each background image
/// file name, so it must end with a path separator if it names a directory.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="wri"/> or <paramref name="doc"/> is null.
/// </exception>
public void Convert(TextWriter wri, PDFDoc doc, int page_number,
    double zoom, string outpath)
{
    if (wri == null)
    {
        throw new ArgumentNullException("wri");
    }
    if (doc == null)
    {
        throw new ArgumentNullException("doc");
    }

    _doc = doc;
    _zoom = zoom;
    _out_path = outpath;

    using (W.HtmlTextWriter html = new W.HtmlTextWriter(wri))
    {
        html.RenderBeginTag(W.HtmlTextWriterTag.Html);

        html.RenderBeginTag(W.HtmlTextWriterTag.Head);
        html.AddAttribute("http-equiv", "Content-Type");
        html.AddAttribute("content", "text/html");
        html.AddAttribute("charset", html.Encoding.WebName);
        html.RenderBeginTag(W.HtmlTextWriterTag.Meta);
        html.RenderEndTag(); // </meta>
        html.RenderBeginTag(W.HtmlTextWriterTag.Title);
        html.Write("PDFTron PdfToHtml Sample");
        html.RenderEndTag(); // </title>
        html.RenderEndTag(); // </head>

        html.RenderBeginTag(W.HtmlTextWriterTag.Body);

        // Pages are rendered to strings first because rendering is what
        // populates _style_map; the CSS rules must be written before the
        // page markup that refers to them.
        ArrayList html_page_list = new ArrayList();
        _page_offset = 0;
        if (page_number == -1)
        {
            for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
            {
                _page_height = (int)(itr.Current().GetPageHeight() * _zoom);
                html_page_list.Add(RenderHTMLBody(itr.Current()));
                _page_offset += _page_height;
            }
        }
        else
        {
            Page single_page = doc.GetPage(page_number);
            _page_height = (int)(single_page.GetPageHeight() * _zoom);
            html_page_list.Add(RenderHTMLBody(single_page));
        }

        // Write global CSS style section.
        // Attributes must be added BEFORE RenderBeginTag; the original added
        // "type" afterwards, so it was never emitted on the <style> tag.
        html.AddAttribute("type", "text/css");
        html.RenderBeginTag(W.HtmlTextWriterTag.Style);
        foreach (DictionaryEntry style in _style_map)
        {
            html.Write(".f" + style.Value + "{" + style.Key + "}\n");
        }
        html.RenderEndTag(); // </style>

        foreach (Object page_html in html_page_list)
        {
            html.Write(page_html);
        }

        html.RenderEndTag(); // </body>
        html.RenderEndTag(); // </html>

        // Kept from the original sample: the caller's writer is closed here.
        wri.Close();
    }
}
/// <summary>
/// Renders a single page as an HTML fragment: one absolutely positioned
/// div (sized and offset using _zoom, _page_height and _page_offset) that
/// contains one positioned span per extracted word and, when
/// _draw_back_image is set, an img element for the page background.
/// </summary>
/// <param name="page">The PDF page to render.</param>
/// <returns>The HTML markup for the page.</returns>
private string RenderHTMLBody(Page page)
{
    StringWriter strw = new StringWriter();
    using (W.HtmlTextWriter html = new W.HtmlTextWriter(strw, " "))
    {
        // Page container: id is the page index, geometry is in scaled pixels.
        html.AddAttribute("id", page.GetIndex().ToString());
        html.AddStyleAttribute("position", "absolute");
        html.AddStyleAttribute("white-space", "nowrap");
        html.AddStyleAttribute("color", "#000000");
        html.AddStyleAttribute("top", _page_offset + "px");
        html.AddStyleAttribute("left", "0px");
        html.AddStyleAttribute("width", ((int)(page.GetPageWidth() * _zoom)).ToString() + "px");
        html.AddStyleAttribute("height", _page_height.ToString() + "px");
        html.RenderBeginTag("div");

        using (TextExtractor txt = new TextExtractor())
        {
            // Extraction runs exactly once. In the e-mailed listing a wrapped
            // comment made the commented-out alternative a second live call:
            //   txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
            txt.Begin(page, null, TextExtractor.ProcessingFlags.e_none);

            // For each line on the page...
            for (TextExtractor.Line line = txt.GetFirstLine();
                 line.IsValid(); line = line.GetNextLine())
            {
                // For now, skip rotated lines.
                if (!line.IsSimpleLine())
                {
                    continue;
                }

                // For each word in the line...
                for (TextExtractor.Word word = line.GetFirstWord();
                     word.IsValid(); word = word.GetNextWord())
                {
                    if (word.GetStringLen() == 0)
                    {
                        continue;
                    }

                    TextExtractor.Style s = word.GetStyle();
                    Font f = new Font(s.GetFont());

                    // ZapfDingbats text is not emitted as HTML text;
                    // ProcessElements keeps it in the background bitmap.
                    if (f.GetName().IndexOf("ZapfDingbats", StringComparison.Ordinal) >= 0)
                    {
                        continue;
                    }

                    // Every word becomes its own span, positioned relative
                    // to the page div.
                    AddStyle(html, s, word.GetBBox(), page, false);
                    html.RenderBeginTag("span");
                    html.WriteEncodedText(word.GetString());
                    html.RenderEndTag(); // </span>
                }
            }
        }

        if (_draw_back_image)
        {
            string img = RenderHTMLBackgroundImage(page);
            if (img != "")
            {
                html.AddAttribute("src", img);
                html.AddAttribute("alt", "background image");
                html.RenderBeginTag("img");
                html.RenderEndTag(); // </img>
            }
        }

        html.RenderEndTag(); // </div>
        return strw.ToString();
    }
}
/// <summary>
/// Builds a temporary page holding the content of <paramref name="page"/>
/// as filtered by ProcessElements, rasterizes it with PDFDraw at
/// (_zoom * 72) DPI and writes it as "page{index}.png" under _out_path.
/// </summary>
/// <param name="page">The source page whose background is rendered.</param>
/// <returns>
/// The image file name without the _out_path prefix, for use as an
/// img src value.
/// </returns>
private string RenderHTMLBackgroundImage(Page page)
{
    Page new_page;
    try
    {
        _reader = new ElementReader();
        _writer = new ElementWriter();
        _builder = new ElementBuilder();

        new_page = _doc.PageCreate();
        _writer.Begin(new_page);
        _reader.Begin(page);
        ProcessElements();
        _writer.End();
        _reader.End();
    }
    finally
    {
        // Release the element helpers even when element processing throws;
        // the original only disposed them on the success path.
        if (_writer != null)
        {
            _writer.Dispose();
            _writer = null;
        }
        if (_reader != null)
        {
            _reader.Dispose();
            _reader = null;
        }
        if (_builder != null)
        {
            _builder.Dispose();
            _builder = null;
        }
    }

    // Give the temporary page the same visible area and rotation as the source.
    new_page.SetMediaBox(page.GetCropBox());
    new_page.SetRotation(page.GetRotation());

    string filename = String.Format("page{0}.png", page.GetIndex());
    using (PDFDraw draw = new PDFDraw())
    {
        draw.SetDPI(_zoom * 72);
        draw.Export(new_page, _out_path + filename, "png");
    }
    return filename;
}
/// <summary>
/// Queues the positioning style attributes and the CSS class attribute for
/// the next element rendered on <paramref name="html"/>. Font-related
/// declarations are collapsed into a shared class ("f" + id) that is
/// registered in _style_map, so identical styles are written only once.
/// </summary>
/// <param name="html">Writer whose next begin tag receives the attributes.</param>
/// <param name="s">Text style (font, size, color, weight) of the element.</param>
/// <param name="bbox">Bounding box of the element in page coordinates.</param>
/// <param name="page">Page used to flip the vertical axis for absolute positioning.</param>
/// <param name="relative">
/// True to use bbox.y1 directly and "position:relative"; false to measure
/// from the top of the page and use "position:absolute".
/// </param>
private void AddStyle(W.HtmlTextWriter html, TextExtractor.Style s,
    Rect bbox, Page page, bool relative)
{
    // CSS numbers must use '.' as the decimal separator regardless of the
    // current thread culture.
    System.Globalization.CultureInfo inv = System.Globalization.CultureInfo.InvariantCulture;

    double xpos = _zoom * bbox.x1;
    double ypos;
    if (relative)
    {
        ypos = _zoom * bbox.y1;
    }
    else
    {
        // Convert the box's top edge to a distance from the top of the page.
        ypos = _zoom * (page.GetPageHeight() - bbox.y1 - bbox.Height());
    }

    html.AddStyleAttribute("top", ((int)ypos).ToString(inv) + "px");
    html.AddStyleAttribute("left", ((int)xpos).ToString(inv) + "px");
    html.AddStyleAttribute("width", ((int)(_zoom * bbox.Width())).ToString(inv) + "px");

    string font_class = "position:" + (relative ? "relative" : "absolute");

    // Emit an explicit unit and a fixed-point number: a unit-less or
    // exponent-formatted length is not a valid CSS font-size.
    font_class += ";font-size:" + (_zoom * s.GetFontSize()).ToString("0.##", inv) + "px";

    System.Drawing.Color color = s.GetColor();
    if (color.ToArgb() != System.Drawing.Color.Black.ToArgb())
    {
        font_class += ";color:" + System.Drawing.ColorTranslator.ToHtml(color);
    }

    // Trim away some characters from the font name that are not liked by CSS.
    string fnt = s.GetFontName();
    int idx = fnt.IndexOf('-');
    if (idx >= 0)
    {
        fnt = fnt.Substring(0, idx);
    }
    fnt += ",ArialUnicode,Arial,Helvetica";
    // Finish with the generic family that matches the font; the original
    // appended "sans-serif" only when the font WAS a serif font.
    fnt += s.IsSerif() ? ",serif" : ",sans-serif";
    font_class += ";font-family:" + fnt;

    if (s.IsItalic())
    {
        font_class += ";font-style:" + "italic";
    }

    // The original omitted the ':' and produced e.g. "font-weight400".
    font_class += ";font-weight:" + s.GetWeight().ToString();

    // Reuse the class id of an identical declaration list, or register a new one.
    int class_id;
    if (_style_map.ContainsKey(font_class))
    {
        class_id = (int)_style_map[font_class];
    }
    else
    {
        class_id = _style_map.Count;
        _style_map.Add(font_class, class_id);
    }

    html.AddAttribute("class", "f" + class_id.ToString(inv));
}
Hashtable _style_map = new Hashtable();
/// <summary>
/// Copies page content from _reader to _writer, dropping the text that
/// the HTML text layer renders itself. Text that is rotated or skewed, or
/// that uses a ZapfDingbats font, is kept so it stays in the background
/// image. Form XObjects are expanded recursively inside a group that
/// applies the form matrix and clips to the form's BBox.
/// </summary>
private void ProcessElements()
{
    Element element;
    while ((element = _reader.Next()) != null)
    {
        switch (element.GetType())
        {
            case Element.Type.e_text:
            {
                // Rotated or vertically skewed text -> keep it in the bitmap.
                Matrix2D mtx = element.GetCTM() * element.GetTextMatrix();
                bool output_as_bitmap = mtx.m_b != 0;

                // ZapfDingbats glyphs are also kept in the bitmap.
                Font f = element.GetGState().GetFont();
                if (f.GetName().IndexOf("ZapfDingbats", StringComparison.Ordinal) >= 0)
                {
                    output_as_bitmap = true;
                }

                // All other text is omitted here (it is emitted as HTML text).
                if (output_as_bitmap)
                {
                    _writer.WriteElement(element);
                }
                break;
            }

            case Element.Type.e_form:
            {
                Obj xobj = element.GetXObject();

                // Save GState and apply the form's transformation.
                Element group_begin = _builder.CreateGroupBegin();
                Matrix2D form_mtx = element.GetGState().GetTransform();
                Obj m = xobj.FindObj("Matrix");
                if (m != null)
                {
                    form_mtx.Concat(
                        m.GetAt(0).GetNumber(), m.GetAt(1).GetNumber(),
                        m.GetAt(2).GetNumber(), m.GetAt(3).GetNumber(),
                        m.GetAt(4).GetNumber(), m.GetAt(5).GetNumber());
                }
                group_begin.GetGState().SetTransform(form_mtx);
                _writer.WriteElement(group_begin);

                _reader.FormBegin();

                // Output the clipping path for the Form XObject.
                Obj box_obj = xobj.FindObj("BBox");
                if (box_obj != null)
                {
                    Rect bbox = new Rect(box_obj);
                    Element clip = _builder.CreateRect(bbox.x1, bbox.y1,
                        bbox.Width(), bbox.Height());
                    clip.SetPathClip(true);
                    clip.SetPathFill(false);
                    clip.SetPathStroke(false);
                    _writer.WriteElement(clip);
                }

                // Recurse into the form's own content stream.
                ProcessElements();

                // Restore the graphics state and leave the form.
                _writer.WriteElement(_builder.CreateGroupEnd());
                _reader.End();
                break;
            }

            default:
                _writer.WriteElement(element);
                break;
        }
    }
}
private string _out_path;
private double _zoom = 1.0;
private ElementWriter _writer = null;
private ElementReader _reader = null;
private ElementBuilder _builder = null;
private PDFDoc _doc = null;
private int _page_offset, _page_height;
bool _draw_back_image = true;
/// <summary>
/// Sample entry point: converts all pages of the bundled newsletter.pdf
/// test file to "out.html" at a 96/72 zoom factor. PDFNet errors are
/// reported on the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    PDFNet.Initialize();
    try
    {
        // The path must be a single-line literal; in the e-mailed listing it
        // was wrapped across two lines, which does not compile.
        using (PDFDoc doc = new PDFDoc("../../../../TestFiles/newsletter.pdf"))
        {
            doc.InitSecurityHandler();

            string output_path = "";
            using (TextWriter wri = File.CreateText(output_path + "out.html"))
            {
                PdfToHtml pdf2html = new PdfToHtml();
                pdf2html.Convert(wri, doc, -1, 96.0 / 72.0, output_path);
            }
        }
    }
    catch (PDFNetException e)
    {
        Console.WriteLine(e.Message);
    }
    Console.WriteLine("Done.");
}
}
}
--
You received this message because you are subscribed to the "PDFTron PDFNet SDK" group. To post to this group, send email to support@pdftron.com
To unsubscribe from this group, send email to pdfnet-sdk-unsubscribe@googlegroups.com. For more information, please visit us at http://www.pdftron.com