Q: I am using the following VB snippet to extract text from PDF
documents within a Windows Service. Occasionally I get the following
exception:
pdftron.Common.PDFNetException: Unknown exception.
at pdftron.PDF.TextExtractor.Begin(Page page, Rect clip_ptr,
ProcessingFlags flags)
at ConsoleApplication1.PDFTronExtractor.ExtractText(Page& page,
Rect& pos)
This app simulates what we actually do: a Windows service that
monitors a queue. When an item enters the queue, the service retrieves
the appropriate file and extracts the text from it.
Imports PDFTRON
Imports PDFTRON.PDF
Imports System.Drawing
''' <summary>
''' Extracts the lines of text found inside a rectangular region of the
''' first page of a PDF file.
''' </summary>
''' <remarks>
''' The constructor calls PDFNet.Initialize and Dispose calls
''' PDFNet.Terminate, so call Dispose (directly or through a Using block)
''' once the extractor is no longer needed.
''' </remarks>
Public Class PDFTronExtractor
    Implements System.IDisposable

#Region "Local Members"
    Private mstrFilePath As String
    Private mstrAddressLines() As String
    Private mRect As System.Drawing.Rectangle
    ' Set once Dispose has run so PDFNet.Terminate is not called twice.
    Private mDisposed As Boolean
#End Region

#Region "Properties"
    ''' <summary>
    ''' Returns the lines extracted by the last call to Extract, or Nothing
    ''' when Extract has not completed successfully.
    ''' </summary>
    Public ReadOnly Property AddressLines() As String()
        Get
            Return mstrAddressLines
        End Get
    End Property

    ''' <summary>
    ''' The path of the PDF file to read.
    ''' </summary>
    Public Property FilePath() As String
        Get
            Return mstrFilePath
        End Get
        Set(ByVal value As String)
            mstrFilePath = value
        End Set
    End Property

    ''' <summary>
    ''' The region to extract text from. Y is measured downwards from the
    ''' top edge of the page; Extract converts it to a PDF rectangle whose
    ''' Y axis is measured from the bottom of the page.
    ''' </summary>
    Public Property Rectangle() As System.Drawing.Rectangle
        Get
            Return mRect
        End Get
        Set(ByVal value As System.Drawing.Rectangle)
            mRect = value
        End Set
    End Property

    ''' <summary>
    ''' Calls PDFNet.Terminate. Calling Dispose more than once is harmless.
    ''' </summary>
    Public Sub Dispose() Implements System.IDisposable.Dispose
        If mDisposed Then
            Return
        End If
        mDisposed = True
        PDFTRON.PDFNet.Terminate()
    End Sub
#End Region

#Region "Functions"
    ''' <summary>
    ''' Creates the extractor and calls PDFNet.Initialize.
    ''' </summary>
    Public Sub New()
        PDFNet.Initialize()
    End Sub

    ''' <summary>
    ''' Extracts the text inside Rectangle from page 1 of the file at
    ''' FilePath and stores the non-empty lines in AddressLines.
    ''' </summary>
    ''' <returns>Always True; failures surface as exceptions.</returns>
    ''' <remarks>
    ''' AddressLines is reset to Nothing first, so it stays Nothing if the
    ''' extraction throws.
    ''' </remarks>
    Public Function Extract() As Boolean
        mstrAddressLines = Nothing

        Dim doc As PDFDoc = New PDFDoc(mstrFilePath)
        Try
            doc.InitSecurityHandler()
            Dim page As Page = doc.GetPage(1)

            ' Convert the top-left based rectangle into bottom-left based
            ' page coordinates.
            Dim x1 As Double = mRect.X
            Dim x2 As Double = mRect.X + mRect.Width
            Dim y1 As Double = page.GetPageHeight - (mRect.Y + mRect.Height)
            Dim y2 As Double = y1 + mRect.Height
            Dim r As Rect = New Rect(x1, y1, x2, y2)

            Dim separators() As String = New String() {vbNewLine}
            mstrAddressLines = ExtractText(page, r).Split(separators, StringSplitOptions.RemoveEmptyEntries)
        Finally
            ' Close the document even when extraction fails; a long-running
            ' service would otherwise accumulate open documents.
            doc.Close()
        End Try

        Return True
    End Function

    ''' <summary>
    ''' Returns the visible text inside <paramref name="pos"/> on
    ''' <paramref name="page"/>: words separated by single spaces, one
    ''' extracted line per output line.
    ''' </summary>
    ''' <param name="page">The page to read.</param>
    ''' <param name="pos">The clip rectangle, in page coordinates.</param>
    ''' <returns>The extracted text; empty when no lines were found.</returns>
    Private Function ExtractText(ByVal page As Page, ByVal pos As Rect) As String
        Dim sb As New System.Text.StringBuilder()

        ' Using disposes the TextExtractor on every exit path. Leaving it
        ' undisposed is what eventually exhausts resources and produces the
        ' "Unknown exception" from TextExtractor.Begin.
        Using te As New PDFTRON.PDF.TextExtractor()
            te.Begin(page, pos, PDF.TextExtractor.ProcessingFlags.e_no_invisible_text)

            Dim line As PDFTRON.PDF.TextExtractor.Line = te.GetFirstLine
            While line.IsValid
                Dim bNewLine As Boolean = True
                Dim w As PDFTRON.PDF.TextExtractor.Word = line.GetFirstWord
                While w.IsValid
                    ' Put a space before every word except the first on the line.
                    If Not bNewLine Then
                        sb.Append(" ")
                    End If
                    bNewLine = False
                    sb.Append(w.GetString)
                    w = w.GetNextWord
                End While
                sb.AppendLine()
                line = line.GetNextLine
            End While
        End Using

        Return sb.ToString
    End Function
#End Region
End Class
''' <summary>
''' Console harness that mimics the Windows service: it extracts text from
''' the same PDF over and over until an exception is thrown.
''' </summary>
Module Module1
    ''' <summary>
    ''' Runs the extraction loop, prints the first exception raised, releases
    ''' the extractor and then waits for Enter.
    ''' </summary>
    Sub Main()
        Const SamplePdf As String = "OpenOffice PDF.pdf"
        Dim extractor As PDFTronExtractor = New PDFTronExtractor()
        Try
            ' Deliberately endless: the loop only stops when Extract throws.
            While True
                extractor.FilePath = SamplePdf
                extractor.Extract()
            End While
        Catch failure As Exception
            Console.WriteLine(failure.ToString())
        Finally
            extractor.Dispose()
        End Try
        Console.ReadLine()
    End Sub
End Module
---------
A: The problem is that TextExtractor's Dispose() method is never called
(te.Dispose() must be called in ExtractText). Not calling Dispose() on
TextExtractor/ElementBuilder/ElementWriter/PDFView can lead to resource
exhaustion. You may want to use the .NET IDisposable pattern or the
'Using' statement so that Dispose() is called even when an exception is
thrown.