Text Highlighting

NOTE — This sample is part of the downloadable distribution

This sample demonstrates how to use the text extraction capabilities of PDF Clown to obtain the coordinates at which text highlight annotations should be placed.

The extracted text is matched against a regular expression to locate each occurrence; the occurrences are then highlighted by creating TextMarkup annotations on their respective pages:

Imports — First of all, let’s declare the imports used in this sample:

import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.File;
import org.pdfclown.tools.TextExtractor;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;

File opening — We begin by opening the PDF file:
```
// NOTE: File here is org.pdfclown.files.File (see the imports), not java.io.File;
// myFilePath is expected to be defined by the surrounding code.
File file = new File(myFilePath);
```

Preparation — Then we prepare the extraction (in this example choosing the arbitrary regex string “rabbit”):

// Define the text pattern to look for!
// NOTE: The pattern is compiled once, before the page loop, and matched case-insensitively.
String textRegEx = "rabbit";
Pattern pattern = Pattern.compile(textRegEx, Pattern.CASE_INSENSITIVE);

// Instantiate the extractor!
// NOTE(review): The meaning of the two boolean flags is not visible here -- confirm
// against the TextExtractor constructor documentation before changing them.
TextExtractor textExtractor = new TextExtractor(true, true);

Text extraction — We iterate over the pages, extracting the text of each one and matching it against the regex:

// NOTE: page is declared final because it is referenced from within the anonymous
// filter class defined further down in this loop body.
for(final Page page : file.getDocument().getPages())
{
  // Extract the page text!
  Map<Rectangle2D,List<ITextString>> textStrings = textExtractor.extract(page);

  // Find the text pattern matches!
  // NOTE: The matcher runs over the plain-string rendition of the extracted text; it is
  // final for the same reason as page (it is captured by the anonymous filter class).
  final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));

Text highlighting — Filtered occurrences are highlighted through TextMarkup annotations:

  // Highlight the text pattern matches!
  /*
    NOTE: The anonymous filter feeds the extractor with the character intervals of the regex
    matches (hasNext/next) and is called back (process) with the text corresponding to each
    interval, which is then turned into a highlight annotation on the current page.
  */
  textExtractor.filter(
    textStrings,
    new TextExtractor.IIntervalFilter(
      )
    {
      /*
        NOTE: This call advances the matcher to its next occurrence, so it is NOT idempotent:
        it presumably relies on the extractor invoking hasNext() exactly once before each
        next() -- confirm against the TextExtractor.filter documentation.
      */
      @Override
      public boolean hasNext(
        )
      {return matcher.find();}

      /*
        Returns the character interval [start, end) of the match the matcher is currently
        positioned on.
      */
      @Override
      public Interval next(
        )
      {return new Interval(matcher.start(), matcher.end());}

      /*
        Creates a highlight annotation covering the given match.

        interval: the character interval the match was requested for (unused here).
        match: the text string whose characters supply the boxes to highlight.
      */
      @Override
      public void process(
        Interval interval,
        ITextString match
        )
      {
        // Defining the highlight box of the text pattern match...
        List<Quad> highlightQuads = new ArrayList<Quad>();
        {
          /*
            NOTE: A text pattern match may be split across multiple contiguous lines,
            so we have to define a distinct highlight box for each text chunk.
          */
          Rectangle2D textBox = null;
          for(TextChar textChar : match.getTextChars())
          {
            Rectangle2D textCharBox = textChar.getBox();
            if(textBox == null)
            {textBox = (Rectangle2D)textCharBox.clone();}
            else
            {
              // A character starting below the bottom of the current box begins a new line.
              if(textCharBox.getY() > textBox.getMaxY())
              {
                highlightQuads.add(Quad.get(textBox));
                textBox = (Rectangle2D)textCharBox.clone();
              }
              else
              {textBox.add(textCharBox);}
            }
          }
          // Flush the last (possibly only) chunk; textBox stays null if the match has no characters.
          if(textBox != null)
          {highlightQuads.add(Quad.get(textBox));}
        }
        // Nothing to highlight (e.g. a pattern that matched an empty string).
        if(highlightQuads.isEmpty())
          return;

        // Highlight the text pattern match!
        // NOTE(review): the instance is discarded, so the constructor presumably registers the
        // annotation on the page; the null argument is presumably the annotation text -- confirm.
        new TextMarkup(page, null, MarkupTypeEnum.Highlight, highlightQuads);
      }

      // Removal of matches is not supported by this filter.
      @Override
      public void remove(
        )
      {throw new UnsupportedOperationException();}
    }
    );
}