Extracting image 'Alt' descriptions / logical structure.

Aaron_Gravesdale · May 25, 2007, 1:16am

Q:

I am aware that images are sometimes tagged with "alt" descriptions to
make them more accessible. Do you have any idea how I can access this
information? Also, if you know of any other places where we can read
accessibility/metadata about objects, that would be useful too.
----
A:

You can access this metadata (if it is available) using 'marked
content' (i.e. by processing the e_marked_content_begin/
e_marked_content_end 'Element'-s that should surround the image object).
When you encounter a marked content element you can access its
property dictionary using element.GetMCPropertyDict().

PDFNet SDK v3.7 (which will be released soon) also supports new high-
level API-s for logical structure extraction, creation, and editing.
The following is a new sample project that illustrates the use of this
API to extract logical structure from 'tagged' PDF documents.

//---------------------------------------------------------------------------------------
// This sample explores the structure and content of a tagged PDF document and dumps
// the structure information to the console window.
//
// In tagged PDF documents StructTree acts as a central repository for information
// related to a PDF document's logical structure. The tree consists of StructElement-s
// and ContentItem-s which are leaf nodes of the structure tree.
//
// The sample can be extended to access and extract the marked-content elements such
// as text and images.
//---------------------------------------------------------------------------------------

// Starts a new line on the console and indents it by `ident` spaces.
// A zero or negative `ident` produces only the line break.
void PrintIdent(int ident)
{
  cout << '\n';
  for (int remaining = ident; remaining > 0; --remaining) {
    cout << ' ';
  }
}

// Used in code snippet 1.
//
// Recursively dumps one node of the logical structure tree to the console.
// The element's type (and title, when it has one) is printed at indentation
// level `ident`; each of its kids is then reported one level deeper:
//   - content items are printed with the index of the page they belong to and
//     either their MCID or the object number of the referenced object;
//   - nested structure elements are handled by recursion.
// An invalid element is silently ignored.
void ProcessStructElement(Struct::SElement element, int ident)
{
  if (!element.IsValid()) return;

  // Header line for this element; everything about its kids goes one level deeper.
  PrintIdent(ident);
  const int kid_ident = ident + 1;
  cout << "Type: "<< element.GetType();
  if (element.HasTitle()) cout << ". Title: "<< element.GetTitle();

  const int kid_count = element.GetNumKids();
  for (int k = 0; k < kid_count; ++k)
  {
    if (!element.IsContentItem(k)) {
      // The kid is another StructElement node.
      ProcessStructElement(element.GetAsStructElem(k), kid_ident);
      continue;
    }

    // The kid is a leaf node (i.e. it is a ContentItem).
    Struct::ContentItem item = element.GetAsContentItem(k);
    Struct::ContentItem::Type item_type = item.GetType();
    Page page = item.GetPage();

    PrintIdent(kid_ident);
    cout << "Content Item. Part of page #" << page.GetIndex();

    // A second indented line carries the type-specific details (it stays
    // empty for content item types not handled below).
    PrintIdent(kid_ident);
    if (item_type == Struct::ContentItem::e_MCID ||
        item_type == Struct::ContentItem::e_MCR) {
      cout << "MCID: " << item.GetMCID();
    }
    else if (item_type == Struct::ContentItem::e_OBJR) {
      cout << "OBJR ";
      SDF::Obj* ref_obj = item.GetRefObj();
      if (ref_obj) {
        cout << "- Referenced Object#: " << ref_obj->GetObjNum();
      }
    }
  }
}

// Used in code snippet 2.
//
// Reads every Element that `reader` yields (the caller is expected to have
// called reader.Begin() on a page) and, for path, text and form XObject
// elements, prints the element kind followed by details of the structure
// element the content belongs to, if any.
void ProcessElements(ElementReader& reader)
{
  Element* element;
  while ((element = reader.Next()) != 0) // Read page contents
  {
    // In this sample we process only paths, text and form XObjects, but the
    // code can be extended to handle any element type.
    const Element::Type type = element->GetType();

    // e_form has to be accepted here; otherwise the e_form case in the switch
    // below could never be reached.
    if (type != Element::e_path &&
        type != Element::e_text &&
        type != Element::e_form) {
      continue;
    }

    switch (type) {
    case Element::e_path: // Process path ...
      cout << "\nPATH: ";
      break;
    case Element::e_text: // Process text ...
      cout << "\nTEXT: " << *element->GetTextString() << "\n ";
      break;
    case Element::e_form: // Process form XObjects
      cout << "\nFORM XObject: ";
      //reader.FormBegin();
      //ProcessElements(reader);
      //reader.End();
      break;
    default:
      break;
    }

    // Check if the element is associated with any structural element.
    // Content items are leaf nodes of the structure tree.
    Struct::SElement struct_parent = element->GetParentStructElement();
    if (struct_parent.IsValid()) {
      // Print out the parent structural element's type, MCID, title, and
      // object number.
      cout << " Type: " << struct_parent.GetType()
        << ", MCID: " << element->GetStructMCID();
      if (struct_parent.HasTitle()) {
        cout << ". Title: "<< struct_parent.GetTitle();
      }
      cout << ", Obj#: " << struct_parent.GetSDFObj()->GetObjNum();
    }
  }
}

// Used in code snippet 3.
// MCIDPageMap: text collected from a single page, keyed by the marked-content
// identifier (MCID) of the elements it came from; filled by ProcessElements2().
typedef map<int, string> MCIDPageMap;
// MCIDDocMap: one MCIDPageMap per page, keyed by the page index; built in
// main() and looked up by ProcessStructElement2().
typedef map<int, MCIDPageMap> MCIDDocMap;

// Used in code snippet 3.
//
// Walks all elements produced by `reader` and gathers the text of the page,
// grouped by marked-content identifier: for every text element whose struct
// MCID is non-negative, its ASCII-converted text is appended to
// mcid_page_map[MCID]. Elements of other types, and text without an MCID,
// are skipped.
//
// In this sample we process only text, but the code can be extended to handle
// paths, images, or any other Element type.
void ProcessElements2(ElementReader& reader, MCIDPageMap& mcid_page_map)
{
  for (Element* elem = reader.Next(); elem != 0; elem = reader.Next()) // Read page contents
  {
    const int mcid = elem->GetStructMCID();
    if (mcid < 0) continue;
    if (elem->GetType() != Element::e_text) continue;

    const string text = elem->GetTextString()->ConvertToAscii();

    // operator[] creates an empty entry the first time an MCID is seen, so a
    // plain append covers both the "new key" and the "existing key" case.
    mcid_page_map[mcid] += text;
  }
}

// Used in code snippet 3.
//
// Recursively prints a structure element as an XML-like tag at indentation
// level `ident`: "<Type title="...">", followed by the content of its kids,
// followed by "</Type>". For kids that are MCID content items, the text stored
// in `mcid_doc_map` for the item's page index and MCID (if any) is printed;
// kids that are structure elements are printed recursively one level deeper.
// An invalid element produces no output.
void ProcessStructElement2(Struct::SElement element, MCIDDocMap& mcid_doc_map, int ident)
{
  if (!element.IsValid()) return;

  // Opening tag with the type and, if any, the title.
  PrintIdent(ident);
  cout << "<" << element.GetType();
  if (element.HasTitle()) cout << " title=\""<< element.GetTitle() << "\"";
  cout << ">";

  const int kid_count = element.GetNumKids();
  for (int k = 0; k < kid_count; ++k)
  {
    if (!element.IsContentItem(k)) {
      // The kid is another StructElement node.
      ProcessStructElement2(element.GetAsStructElem(k), mcid_doc_map, ident + 1);
      continue;
    }

    Struct::ContentItem item = element.GetAsContentItem(k);
    if (item.GetType() != Struct::ContentItem::e_MCID) continue;

    // Locate the per-page map first, then the text recorded for this MCID.
    const int page_num = item.GetPage().GetIndex();
    MCIDDocMap::iterator page_pos = mcid_doc_map.find(page_num);
    if (page_pos == mcid_doc_map.end()) continue;

    MCIDPageMap& page_texts = page_pos->second;
    MCIDPageMap::iterator text_pos = page_texts.find(item.GetMCID());
    if (text_pos != page_texts.end()) cout << text_pos->second;
  }

  // Closing tag.
  PrintIdent(ident);
  cout << "</" << element.GetType() << ">";
}

int main(int argc, char *argv[])
{
  int ret = 0;
  PDFNet::Initialize();
  PDFNet::SetResourcesPath("../../../resources");

  // Relative path to the folder containing test files.
  string input_path = "../../TestFiles/";
  // string output_path = "../../TestFiles/Output/";

  try // Extract logical structure from a PDF document
  {
    PDFDoc doc((input_path + "tagged.pdf").c_str());
    doc.InitSecurityHandler();

    cout <<
"____________________________________________________________" <<
endl;
    cout << "Sample 1 - Traverse logical structure tree..." << endl;
    {
      Struct::STree tree = doc.GetStructTree();
      if (tree.IsValid()) {
        cout << "Document has a StructTree root." << endl;

        for (int i=0; i<tree.GetNumKids(); ++i) {
          // Recursively get structure info for all all child elements.
          ProcessStructElement(tree.GetKid(i), 0);
        }
      }
      else {
        cout << "This document does not contain any logical structure." <<
endl;
      }
    }
    cout << "\nDone 1." << endl;

    cout <<
"____________________________________________________________" <<
endl;
    cout << "Sample 2 - Get parent logical structure elements from" <<
endl;
    cout << "layout elements." << endl;
    {
      PageIterator end = doc.PageEnd();
      ElementReader reader;
      for (PageIterator itr = doc.PageBegin(); itr!=end; ++itr) {
        reader.Begin(*itr);
        ProcessElements(reader);
        reader.End();
      }
    }
    cout << "\nDone 2." << endl;

    cout <<
"____________________________________________________________" <<
endl;
    cout << "Sample 3 - XML like extraction of PDF logical structure and
page content." << endl;
    {
      MCIDDocMap mcid_doc_map;
      PageIterator end = doc.PageEnd();
      ElementReader reader;
      for (PageIterator itr = doc.PageBegin(); itr!=end; ++itr) {
        reader.Begin(*itr);
        pair<MCIDDocMap::iterator, bool> r =
mcid_doc_map.insert(MCIDDocMap::value_type(itr->GetIndex(),
MCIDPageMap()));
        MCIDPageMap& page_mcid_map = (r.first)->second;
        ProcessElements2(reader, page_mcid_map);
        reader.End();
      }

      Struct::STree tree = doc.GetStructTree();
      if (tree.IsValid()) {
        for (int i=0; i<tree.GetNumKids(); ++i) {
          ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
        }
      }
    }
    cout << "\nDone 2." << endl;
  }
  catch(Common::Exception& e)
  {
    cout << e << endl;
    ret = 1;
  }
  catch(...)
  {
    cout << "Unknown Exception" << endl;
    ret = 1;
  }

PDFNet::Terminate();
return ret;
}