From 2c117998ed7af68b0fec4e676d1b7b464f5f4642 Mon Sep 17 00:00:00 2001 From: ale Date: Thu, 23 Oct 2025 02:30:47 +0200 Subject: [PATCH 1/2] Changed `ParsedText` constructor to properly calculate the unscaled width for Type0 fonts; method `getWidth` in `DocumentFont.java` failed to match the character in the `metrics` map (the char was not properly decoded). Using method `ParsedText.getWidth` instead of `ParsedText.getUnscaledTextWidth` when adjusting the `textMatrix` in `displayPdfString` to avoid unnecessary calculations. --- .../openpdf/text/pdf/parser/ParsedText.java | 94 +++++++++---------- .../pdf/parser/PdfContentStreamHandler.java | 6 +- 2 files changed, 48 insertions(+), 52 deletions(-) diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/ParsedText.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/ParsedText.java index 4f7ef4ae1..46e2589d4 100644 --- a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/ParsedText.java +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/ParsedText.java @@ -67,6 +67,35 @@ public class ParsedText extends ParsedTextImpl { */ private PdfString pdfText = null; + static protected ParsedText create(PdfString text, GraphicsState graphicsState, Matrix textMatrix) { + String decoded = ""; + byte[] bytes; + if (BaseFont.IDENTITY_H.equals(graphicsState.getFont().getEncoding())) { + if (graphicsState.getFont().hasUnicodeCMAP()) { + if (graphicsState.getFont().hasTwoByteUnicodeCMAP()) { + text = new PdfString(text.toString(), "IDENTITY_H2"); + } else { + text = new PdfString(text.toString(), "IDENTITY_H1"); + } + } else { + text = new PdfString(new String(text.getBytes(), StandardCharsets.UTF_16)); + } + bytes = text.toString().getBytes(StandardCharsets.UTF_16); + } else { + bytes = text.toString().getBytes(); + } + decoded = graphicsState.getFont().decode(bytes, 0, bytes.length); + char[] chars = decoded.toCharArray(); + float totalWidth = 0; + for (char c : chars) { + float w = 
graphicsState.getFont().getWidth(c) / 1000.0f; + float wordSpacing = Character.isSpaceChar(c) ? graphicsState.getWordSpacing() : 0f; + float blockWidth = (w * graphicsState.getFontSize() + graphicsState.getCharacterSpacing() + wordSpacing) + * graphicsState.getHorizontalScaling(); + totalWidth += blockWidth; + } + return new ParsedText(text, totalWidth, graphicsState, textMatrix); + } /** * This constructor should only be called when the origin for text display is at (0,0) and the graphical state @@ -76,8 +105,10 @@ public class ParsedText extends ParsedTextImpl { * @param graphicsState graphical state * @param textMatrix transform from text space to graphics (drawing space) */ - ParsedText(PdfString text, GraphicsState graphicsState, Matrix textMatrix) { - this(text, new GraphicsState(graphicsState), textMatrix.multiply(graphicsState.getCtm()), + private ParsedText(PdfString text, float unscaledWidth, GraphicsState graphicsState, + Matrix textMatrix) { + this(text, unscaledWidth, new GraphicsState(graphicsState), + textMatrix.multiply(graphicsState.getCtm()), getUnscaledFontSpaceWidth(graphicsState)); } @@ -85,33 +116,23 @@ public class ParsedText extends ParsedTextImpl { * Internal constructor for a parsed text item. The constructors that call it gather some information from the * graphical state first. * - * @param text This is a PdfString containing code points for the current font, not actually characters. If - * the font has multiByte glyphs, (Identity-H encoding) we reparse the string so that the code - * points don't get split into multiple characters. - * @param graphicsState graphical state - * @param textMatrix transform from text space to graphics (drawing space) - * @param unscaledWidth width of the space character in the font. + * @param text This is a PdfString containing code points for the current font, not actually + * characters. 
If the font has multiByte glyphs, (Identity-H encoding) we reparse the + * string so that the code points don't get split into multiple characters. + * @param graphicsState graphical state + * @param textMatrix transform from text space to graphics (drawing space) + * @param unscaledSpaceWidth width of the space character in the font. */ - private ParsedText(PdfString text, GraphicsState graphicsState, Matrix textMatrix, float unscaledWidth) { + private ParsedText(PdfString text, float unscaledWidth, GraphicsState graphicsState, + Matrix textMatrix, + float unscaledSpaceWidth) { super(null, pointToUserSpace(0, 0, textMatrix), - pointToUserSpace(getStringWidth(text.toString(), graphicsState), 0f, textMatrix), + pointToUserSpace(unscaledWidth, 0f, textMatrix), pointToUserSpace(1.0f, 0f, textMatrix), convertHeightToUser(graphicsState.getFontAscentDescriptor(), textMatrix), convertHeightToUser(graphicsState.getFontDescentDescriptor(), textMatrix), - convertWidthToUser(unscaledWidth, textMatrix)); - if (BaseFont.IDENTITY_H.equals(graphicsState.getFont().getEncoding())) { - if (graphicsState.getFont().hasUnicodeCMAP()) { - if (graphicsState.getFont().hasTwoByteUnicodeCMAP()) { - pdfText = new PdfString(text.toString(), "IDENTITY_H2"); - } else { - pdfText = new PdfString(text.toString(), "IDENTITY_H1"); - } - } else { - pdfText = new PdfString(new String(text.getBytes(), StandardCharsets.UTF_16)); - } - } else { - pdfText = text; - } + convertWidthToUser(unscaledSpaceWidth, textMatrix)); + pdfText = text; textToUserSpaceTransformMatrix = textMatrix; this.graphicsState = graphicsState; } @@ -199,22 +220,6 @@ private static float convertHeightToUser(float height, return distance(endPos, startPos); } - /** - * Decodes a Java String containing glyph ids encoded in the font's encoding, and determine the unicode equivalent - * - * @param in the String that needs to be decoded - * @return the decoded String - */ - // FIXME unreachable block and default encoding - protected 
String decode(String in) { - byte[] bytes; - if (BaseFont.IDENTITY_H.equals(graphicsState.getFont().getEncoding())) { - bytes = in.getBytes(StandardCharsets.UTF_16); - } - bytes = in.getBytes(); - return graphicsState.getFont().decode(bytes, 0, bytes.length); - } - /** * This constructor should only be called when the origin for text display is at (0,0) and the graphical state * reflects all transformations of the baseline. This is in text space units. @@ -258,7 +263,6 @@ public List getAsPartialWords() { for (int i = 0; i < chars.length; i++) { char c = chars[i]; float w = font.getWidth(c) / 1000.0f; - if (hasSpace[i]) { if (wordAccum.length() > 0) { result.add(createWord(wordAccum, wordStartOffset, totalWidth, getBaseline(), @@ -339,14 +343,6 @@ private Word createWord(StringBuffer wordAccum, getSingleSpaceWidth(), wordsAreComplete, currentBreakBefore); } - /** - * @param gs graphic state including current transformation to page coordinates from text measurement - * @return the unscaled (i.e. 
in Text space) width of our text - */ - public float getUnscaledTextWidth(GraphicsState gs) { - return getStringWidth(getFontCodes(), gs); - } - /** * {@inheritDoc} * diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentStreamHandler.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentStreamHandler.java index 8d2a7d16b..8c9eb9f54 100644 --- a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentStreamHandler.java +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentStreamHandler.java @@ -288,11 +288,11 @@ public CMapAwareDocumentFont getCurrentFont() { * @param string the text to display */ void displayPdfString(PdfString string) { - ParsedText renderInfo = new ParsedText(string, graphicsState(), textMatrix); + ParsedText renderInfo = ParsedText.create(string, graphicsState(), textMatrix); if (contextNames.peek() != null) { textFragments.add(renderInfo); } - textMatrix = new Matrix(renderInfo.getUnscaledTextWidth(graphicsState()), 0) + textMatrix = new Matrix(renderInfo.getWidth(), 0) .multiply(textMatrix); } @@ -966,7 +966,7 @@ public void invoke(List operands, PdfContentStreamHandler handler, Pd PdfName subType = stream.getAsName(PdfName.SUBTYPE); if (PdfName.FORM.equals(subType)) { PdfDictionary resources2 = stream.getAsDict(PdfName.RESOURCES); - if (resources2 == null) { + if (resources2 == null) { resources2 = resources; } From f6c14a65d624ef98f323dea481024ffb65d61090 Mon Sep 17 00:00:00 2001 From: ale Date: Thu, 23 Oct 2025 03:17:06 +0200 Subject: [PATCH 2/2] Changed `PdfContentStreamHandler` to abstract class and moved the logic implementation in each class, so that `ContentOperator` is more flexible to use. Implemented a new `PdfContentStreamHandler`:`PdfContentTextLocator` to find and locate the coordinates of a matched regex in the text of a page (basic logic searching inside a `PdfString`, could be extended to group `PdfString` in the same line). 
--- .../text/pdf/parser/MatchedPattern.java | 97 ++ .../pdf/parser/PdfContentStreamHandler.java | 846 +-------------- .../pdf/parser/PdfContentTextExtractor.java | 953 +++++++++++++++++ .../pdf/parser/PdfContentTextLocator.java | 966 ++++++++++++++++++ .../text/pdf/parser/PdfTextExtractor.java | 2 +- .../text/pdf/parser/PdfTextLocator.java | 227 ++++ 6 files changed, 2260 insertions(+), 831 deletions(-) create mode 100644 openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java create mode 100644 openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextExtractor.java create mode 100644 openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java create mode 100644 openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java new file mode 100644 index 000000000..7d062d59e --- /dev/null +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java @@ -0,0 +1,97 @@ +/* + * Copyright 2008 by Kevin Day. + * + * The contents of this file are subject to the Mozilla Public License Version 1.1 + * (the "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the License. + * + * The Original Code is 'iText, a free JAVA-PDF library'. + * + * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by + * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie. + * All Rights Reserved. + * Co-Developer of the code is Paulo Soares. 
Portions created by the Co-Developer + * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved. + * + * Contributor(s): all the names of the contributors are added in the source code + * where applicable. + * + * Alternatively, the contents of this file may be used under the terms of the + * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the + * provisions of LGPL are applicable instead of those above. If you wish to + * allow use of your version of this file only under the terms of the LGPL + * License and not to allow others to use your version of this file under + * the MPL, indicate your decision by deleting the provisions above and + * replace them with the notice and other provisions required by the LGPL. + * If you do not delete the provisions above, a recipient may use your version + * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the MPL as stated above or under the terms of the GNU + * Library General Public License as published by the Free Software Foundation; + * either version 2 of the License, or any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more + * details. + * + * If you didn't download this code from the following link, you should check if + * you aren't using an obsolete version: + * https://github.com/LibrePDF/OpenPDF + */ +package org.openpdf.text.pdf.parser; + +public class MatchedPattern { + + private final String text; + private final int page; + private final float[] coordinates = new float[4]; + + /** + * Constructor to pair a strip of text with its bounding box coordinates inside a page. 
+ * The coordinates system has the origin (0, 0) in the lower left point of the page + * and uses PDF points as unit measure. + * + * @param text string + * @param page int + * @param llx float lower left x coordinate + * @param lly float lower left y coordinate + * @param urx float upper right x coordinate + * @param ury float upper right y coordinate + */ + MatchedPattern(String text, int page, float llx, float lly, float urx, float ury) { + this.text = text; + this.page = page; + coordinates[0] = llx; + coordinates[1] = lly; + coordinates[2] = urx; + coordinates[3] = ury; + } + + public String getText() { + return text; + } + + public int getPage() { + return page; + } + + public float[] getCoordinates() { + return coordinates; + } + + @Override + public String toString() { + String[] c = new String[4]; + for(int i = 0; i < 4; i++) { + c[i] = String.valueOf(coordinates[i]); + } + return "[" + String.join(", ", c) + "]"; + } + +} diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentStreamHandler.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentStreamHandler.java index 8c9eb9f54..30b162b68 100644 --- a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentStreamHandler.java +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentStreamHandler.java @@ -41,29 +41,15 @@ LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the */ package org.openpdf.text.pdf.parser; -import org.openpdf.text.ExceptionConverter; import org.openpdf.text.error_messages.MessageLocalization; import org.openpdf.text.pdf.CMapAwareDocumentFont; -import org.openpdf.text.pdf.PRIndirectReference; -import org.openpdf.text.pdf.PRStream; -import org.openpdf.text.pdf.PRTokeniser; -import org.openpdf.text.pdf.PdfArray; -import org.openpdf.text.pdf.PdfContentParser; import org.openpdf.text.pdf.PdfDictionary; -import org.openpdf.text.pdf.PdfIndirectReference; import org.openpdf.text.pdf.PdfLiteral; -import 
org.openpdf.text.pdf.PdfName; import org.openpdf.text.pdf.PdfNumber; import org.openpdf.text.pdf.PdfObject; -import org.openpdf.text.pdf.PdfReader; -import org.openpdf.text.pdf.PdfStream; import org.openpdf.text.pdf.PdfString; -import java.io.ByteArrayOutputStream; -import java.io.IOException; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.Stack; @@ -72,14 +58,14 @@ LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the * @author dgd */ @SuppressWarnings({"WeakerAccess", "unused"}) -public class PdfContentStreamHandler { +public abstract class PdfContentStreamHandler { - private final Stack> textFragmentStreams = new Stack<>(); - private final Stack contextNames = new Stack<>(); + protected final Stack> textFragmentStreams = new Stack<>(); + protected final Stack contextNames = new Stack<>(); /** * detail parser for text within a marked section. used by TextAssembler */ - private final TextAssembler renderListener; + protected final TextAssembler renderListener; /** * A map with all supported operators operators (PDF syntax). */ @@ -87,22 +73,20 @@ public class PdfContentStreamHandler { /** * Stack keeping track of the graphics state. */ - private Stack gsStack; + protected Stack gsStack; /** * Text matrix. */ - private Matrix textMatrix; + protected Matrix textMatrix; /** * Text line matrix. */ - private Matrix textLineMatrix; - private List textFragments = new ArrayList<>(); + protected Matrix textLineMatrix; + protected List textFragments = new ArrayList<>(); public PdfContentStreamHandler(TextAssembler renderListener) { this.renderListener = renderListener; - installDefaultOperators(); - reset(); } private static Matrix getMatrix(List operands) { @@ -135,51 +119,7 @@ public void registerContentOperator(ContentOperator operator) { /** * Loads all the supported graphics and text state operators in a map. 
*/ - protected void installDefaultOperators() { - operators = new HashMap<>(); - - registerContentOperator(new PdfContentStreamHandler.PushGraphicsState()); - registerContentOperator(new PdfContentStreamHandler.PopGraphicsState()); - registerContentOperator(new PdfContentStreamHandler.ModifyCurrentTransformationMatrix()); - registerContentOperator(new PdfContentStreamHandler.ProcessGraphicsStateResource()); - - PdfContentStreamHandler.SetTextCharacterSpacing tcOperator = new PdfContentStreamHandler.SetTextCharacterSpacing(); - registerContentOperator(tcOperator); - PdfContentStreamHandler.SetTextWordSpacing twOperator = new PdfContentStreamHandler.SetTextWordSpacing(); - registerContentOperator(twOperator); - registerContentOperator(new PdfContentStreamHandler.SetTextHorizontalScaling()); - PdfContentStreamHandler.SetTextLeading tlOperator = new PdfContentStreamHandler.SetTextLeading(); - registerContentOperator(tlOperator); - registerContentOperator(new PdfContentStreamHandler.SetTextFont()); - registerContentOperator(new PdfContentStreamHandler.SetTextRenderMode()); - registerContentOperator(new PdfContentStreamHandler.SetTextRise()); - - registerContentOperator(new PdfContentStreamHandler.BeginText()); - registerContentOperator(new PdfContentStreamHandler.EndText()); - - PdfContentStreamHandler.TextMoveStartNextLine tdOperator = new PdfContentStreamHandler.TextMoveStartNextLine(); - registerContentOperator(tdOperator); - registerContentOperator(new PdfContentStreamHandler.TextMoveStartNextLineWithLeading(tdOperator, tlOperator)); - registerContentOperator(new PdfContentStreamHandler.TextSetTextMatrix()); - PdfContentStreamHandler.TextMoveNextLine tstarOperator = - new PdfContentStreamHandler.TextMoveNextLine(tdOperator); - registerContentOperator(tstarOperator); - - PdfContentStreamHandler.ShowText tjOperator = new PdfContentStreamHandler.ShowText(); - registerContentOperator(new PdfContentStreamHandler.ShowText()); - 
PdfContentStreamHandler.MoveNextLineAndShowText tickOperator = - new PdfContentStreamHandler.MoveNextLineAndShowText(tstarOperator, tjOperator); - registerContentOperator(tickOperator); - registerContentOperator( - new PdfContentStreamHandler.MoveNextLineAndShowTextWithSpacing(twOperator, tcOperator, tickOperator)); - registerContentOperator(new PdfContentStreamHandler.ShowTextArray()); - // marked sections - registerContentOperator(new BeginMarked()); - registerContentOperator(new BeginMarkedDict()); - registerContentOperator(new EndMarked()); - - registerContentOperator(new Do()); - } + protected abstract void installDefaultOperators(); /** * Get the operator to process a command with a given name @@ -187,9 +127,7 @@ protected void installDefaultOperators() { * @param operatorName name of the operator that we might need to call * @return the operator or null if none present */ - public Optional lookupOperator(String operatorName) { - return Optional.ofNullable(operators.get(operatorName)); - } + protected abstract Optional lookupOperator(String operatorName); /** * Invokes an operator. @@ -198,34 +136,11 @@ public Optional lookupOperator(String operatorName) { * @param operands a list with operands * @param resources Pdf Resources found in the file containing the stream. 
*/ - public void invokeOperator(PdfLiteral operator, List operands, PdfDictionary resources) { - String operatorName = operator.toString(); - lookupOperator(operatorName) - .ifPresent(contentOperator -> contentOperator.invoke(operands, this, resources)); - } - - void popContext() { - String contextName = contextNames.pop(); - List newBuffer = textFragmentStreams.pop(); - // put together set of unparsed text fragments - renderListener.reset(); - for (TextAssemblyBuffer fragment : textFragments) { - fragment.accumulate(renderListener, contextName); - } - FinalText contextResult = renderListener.endParsingContext(contextName); - Optional.ofNullable(contextResult) - .map(FinalText::getText) - .filter(text -> !text.isEmpty()) - .ifPresent(text -> newBuffer.add(contextResult)); + public abstract void invokeOperator(PdfLiteral operator, List operands, PdfDictionary resources); - textFragments = newBuffer; - } + abstract void popContext(); - void pushContext(String newContextName) { - contextNames.push(newContextName); - textFragmentStreams.push(textFragments); - textFragments = new ArrayList<>(); - } + abstract void pushContext(String newContextName); /** * Returns the current graphics state. @@ -236,14 +151,7 @@ GraphicsState graphicsState() { return gsStack.peek(); } - public void reset() { - if (gsStack == null || gsStack.isEmpty()) { - gsStack = new Stack<>(); - } - gsStack.add(new GraphicsState()); - textMatrix = null; - textLineMatrix = null; - } + public abstract void reset(); /** * Returns the current text matrix. 
@@ -287,732 +195,10 @@ public CMapAwareDocumentFont getCurrentFont() { * * @param string the text to display */ - void displayPdfString(PdfString string) { - ParsedText renderInfo = ParsedText.create(string, graphicsState(), textMatrix); - if (contextNames.peek() != null) { - textFragments.add(renderInfo); - } - textMatrix = new Matrix(renderInfo.getWidth(), 0) - .multiply(textMatrix); - } + abstract void displayPdfString(PdfString string); /** * @return result text */ - public String getResultantText() { - if (contextNames.size() > 0) { - throw new RuntimeException("can't get text with unprocessed stack items"); - } - StringBuilder res = new StringBuilder(); - for (TextAssemblyBuffer fragment : textFragments) { - res.append(fragment.getText()); - } - return res.toString().trim(); - } - - /** - * A content operator implementation (TJ). - */ - static class ShowTextArray implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "TJ"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfArray array = (PdfArray) operands.get(0); - for (PdfObject entryObj : array.getElements()) { - if (entryObj instanceof PdfString) { - handler.displayPdfString((PdfString) entryObj); - } else { - float tj = ((PdfNumber) entryObj).floatValue(); - handler.applyTextAdjust(tj); - } - } - - } - } - - /** - * A content operator implementation (BT). - */ - static class BeginText implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "BT"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - handler.textMatrix = new Matrix(); - handler.textLineMatrix = handler.textMatrix; - } - } - - /** - * A content operator implementation (ET). 
- */ - static class EndText implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "ET"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - handler.textMatrix = null; - handler.textLineMatrix = null; - } - } - - /** - * A content operator implementation (cm). - */ - static class ModifyCurrentTransformationMatrix implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "cm"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - Matrix matrix = getMatrix(operands); - GraphicsState graphicsState = handler.gsStack.peek(); - graphicsState.multiplyCtm(matrix); - } - } - - /** - * A content operator implementation ('). - */ - static class MoveNextLineAndShowText implements ContentOperator { - - private final PdfContentStreamHandler.TextMoveNextLine textMoveNextLine; - private final PdfContentStreamHandler.ShowText showText; - - public MoveNextLineAndShowText( - PdfContentStreamHandler.TextMoveNextLine textMoveNextLine, - PdfContentStreamHandler.ShowText showText) { - this.textMoveNextLine = textMoveNextLine; - this.showText = showText; - } - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "'"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - textMoveNextLine.invoke(new ArrayList<>(0), handler, resources); - showText.invoke(operands, handler, resources); - } - } - - /** - * A content operator implementation ("). 
- */ - static class MoveNextLineAndShowTextWithSpacing implements ContentOperator { - - private final PdfContentStreamHandler.SetTextWordSpacing setTextWordSpacing; - private final PdfContentStreamHandler.SetTextCharacterSpacing setTextCharacterSpacing; - private final MoveNextLineAndShowText moveNextLineAndShowText; - - public MoveNextLineAndShowTextWithSpacing( - PdfContentStreamHandler.SetTextWordSpacing setTextWordSpacing, - PdfContentStreamHandler.SetTextCharacterSpacing setTextCharacterSpacing, - MoveNextLineAndShowText moveNextLineAndShowText) { - this.setTextWordSpacing = setTextWordSpacing; - this.setTextCharacterSpacing = setTextCharacterSpacing; - this.moveNextLineAndShowText = moveNextLineAndShowText; - } - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "\""; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfNumber aw = (PdfNumber) operands.get(0); - PdfNumber ac = (PdfNumber) operands.get(1); - PdfString string = (PdfString) operands.get(2); - - List twOperands = new ArrayList<>(1); - twOperands.add(0, aw); - setTextWordSpacing.invoke(twOperands, handler, resources); - - List tcOperands = new ArrayList<>(1); - tcOperands.add(0, ac); - setTextCharacterSpacing.invoke(tcOperands, handler, resources); - - List tickOperands = new ArrayList<>(1); - tickOperands.add(0, string); - moveNextLineAndShowText.invoke(tickOperands, handler, resources); - } - } - - /** - * A content operator implementation (Q). 
- */ - static class PopGraphicsState implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Q"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - handler.gsStack.pop(); - } - } - - /** - * A content operator implementation (gs). - */ - static class ProcessGraphicsStateResource implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "gs"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfName dictionaryName = (PdfName) operands.get(0); - PdfDictionary extGState = resources.getAsDict(PdfName.EXTGSTATE); - if (extGState == null) { - throw new IllegalArgumentException( - MessageLocalization.getComposedMessage( - "resources.do.not.contain.extgstate.entry.unable.to.process.operator.1", - getOperatorName())); - } - PdfDictionary gsDic = extGState.getAsDict(dictionaryName); - if (gsDic == null) { - throw new IllegalArgumentException(MessageLocalization.getComposedMessage( - "1.is.an.unknown.graphics.state.dictionary", dictionaryName)); - } - - // at this point, all we care about is the FONT entry in the GS - // dictionary - PdfArray fontParameter = gsDic.getAsArray(PdfName.FONT); - if (fontParameter != null) { - PdfObject pdfObject = fontParameter.getPdfObject(0); - CMapAwareDocumentFont font = new CMapAwareDocumentFont((PRIndirectReference) pdfObject); - float size = fontParameter.getAsNumber(1).floatValue(); - - handler.graphicsState().setFont(font); - handler.graphicsState().setFontSize(size); - } - } - } - - /** - * A content operator implementation (q). 
- */ - static class PushGraphicsState implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "q"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - GraphicsState gs = handler.gsStack.peek(); - GraphicsState copy = new GraphicsState(gs); - handler.gsStack.push(copy); - } - } - - /** - * A content operator implementation (Tc). - */ - static class SetTextCharacterSpacing implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Tc"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfNumber charSpace = (PdfNumber) operands.get(0); - handler.graphicsState().setCharacterSpacing(charSpace.floatValue()); - } - } - - /** - * A content operator implementation (Tf). - */ - static class SetTextFont implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Tf"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfName fontResourceName = (PdfName) operands.get(0); - float size = ((PdfNumber) operands.get(1)).floatValue(); - - PdfDictionary fontsDictionary = resources.getAsDict(PdfName.FONT); - PdfObject pdfObject = fontsDictionary.get(fontResourceName); - CMapAwareDocumentFont font = new CMapAwareDocumentFont((PRIndirectReference) pdfObject); - - handler.graphicsState().setFont(font); - handler.graphicsState().setFontSize(size); - } - } - - /** - * A content operator implementation (Tm). 
- */ - static class TextSetTextMatrix implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Tm"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - handler.textLineMatrix = getMatrix(operands); - handler.textMatrix = handler.textLineMatrix; - } - } - - /** - * A content operator implementation (TD). - */ - static class TextMoveStartNextLineWithLeading implements ContentOperator { - - private final PdfContentStreamHandler.TextMoveStartNextLine moveStartNextLine; - - private final PdfContentStreamHandler.SetTextLeading setTextLeading; - - public TextMoveStartNextLineWithLeading( - PdfContentStreamHandler.TextMoveStartNextLine moveStartNextLine, - PdfContentStreamHandler.SetTextLeading setTextLeading) { - this.moveStartNextLine = moveStartNextLine; - this.setTextLeading = setTextLeading; - } - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "TD"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - float ty = ((PdfNumber) operands.get(1)).floatValue(); - - List tlOperands = new ArrayList<>(1); - tlOperands.add(0, new PdfNumber(-ty)); - setTextLeading.invoke(tlOperands, handler, resources); - moveStartNextLine.invoke(operands, handler, resources); - } - } - - /** - * A content operator implementation (Tj). 
- */ - static class ShowText implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Tj"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfString string = (PdfString) operands.get(0); - handler.displayPdfString(string); - } - } - - /** - * A content operator implementation (T*). - */ - static class TextMoveNextLine implements ContentOperator { - - private final TextMoveStartNextLine moveStartNextLine; - - public TextMoveNextLine(TextMoveStartNextLine moveStartNextLine) { - this.moveStartNextLine = moveStartNextLine; - } - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "T*"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - List tdoperands = new ArrayList<>(2); - tdoperands.add(0, new PdfNumber(0)); - tdoperands.add(1, new PdfNumber(-handler.graphicsState().getLeading())); - moveStartNextLine.invoke(tdoperands, handler, resources); - } - } - - /** - * A content operator implementation (Td). - */ - static class TextMoveStartNextLine implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Td"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - float tx = ((PdfNumber) operands.get(0)).floatValue(); - float ty = ((PdfNumber) operands.get(1)).floatValue(); - - Matrix translationMatrix = new Matrix(tx, ty); - handler.textMatrix = translationMatrix.multiply(handler.textLineMatrix); - handler.textLineMatrix = handler.textMatrix; - } - } - - /** - * A content operator implementation (Tr). 
- */ - static class SetTextRenderMode implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Tr"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfNumber render = (PdfNumber) operands.get(0); - handler.graphicsState().setRenderMode(render.intValue()); - } - } - - /** - * A content operator implementation (Ts). - */ - static class SetTextRise implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Ts"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfNumber rise = (PdfNumber) operands.get(0); - handler.graphicsState().setRise(rise.floatValue()); - } - } - - /** - * A content operator implementation (TL). - */ - static class SetTextLeading implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "TL"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfNumber leading = (PdfNumber) operands.get(0); - handler.graphicsState().setLeading(leading.floatValue()); - } - } - - /** - * A content operator implementation (Tz). - */ - static class SetTextHorizontalScaling implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Tz"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfNumber scale = (PdfNumber) operands.get(0); - handler.graphicsState().setHorizontalScaling(scale.floatValue()); - } - } - - /** - * A content operator implementation (Tw). 
- */ - static class SetTextWordSpacing implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Tw"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfNumber wordSpace = (PdfNumber) operands.get(0); - handler.graphicsState().setWordSpacing(wordSpace.floatValue()); - } - } - - /** - * A content operator implementation (BMC). - */ - private static class BeginMarked implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "BMC"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfName tagName = (PdfName) operands.get(0); - String realName = tagName.toString().substring(1).toLowerCase(Locale.ROOT); - if ("artifact".equals(realName) || "placedpdf".equals(realName)) { - handler.pushContext(null); - } else { - handler.pushContext(realName); - } - } - - } - - /** - * A content operator implementation (BDC). - */ - private static class BeginMarkedDict implements ContentOperator { - - /** - * The BDC marked-content operator which brackets a marked-content sequence of objects within the content - * stream. 
- * - * @param operands list of operands - * @param resources dictionary - * @return PdfDictionary of type BDC marked-content - */ - private static PdfDictionary getBDCDictionary(List operands, PdfDictionary resources) { - PdfObject pdfObject = operands.get(1); - if (pdfObject.isName()) { - PdfDictionary properties = resources.getAsDict(PdfName.PROPERTIES); - PdfIndirectReference ir = properties.getAsIndirectObject((PdfName) pdfObject); - if (ir != null) { - pdfObject = ir.getIndRef(); - } else { - pdfObject = properties.getAsDict((PdfName) pdfObject); - } - } - return (PdfDictionary) pdfObject; - } - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "BDC"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfObject firstOperand = operands.get(0); - String tagName = firstOperand.toString().substring(1).toLowerCase(Locale.ROOT); - if ("artifact".equals(tagName) || "placedpdf".equals(tagName) - || handler.contextNames.peek() == null) { - tagName = null; - } else if ("l".equals(tagName)) { - tagName = "ul"; - } - PdfDictionary attrs = getBDCDictionary(operands, resources); - if (attrs != null && tagName != null) { - PdfString alternateText = attrs.getAsString(PdfName.E); - if (alternateText != null) { - handler.pushContext(tagName); - handler.textFragments - .add(new FinalText(alternateText.toString())); - handler.popContext(); - // ignore rest of the content of this element - handler.pushContext(null); - return; - } else if (attrs.get(PdfName.TYPE) != null) { - // ignore tag for non-tag marked content that sometimes - // shows up. - tagName = ""; - } - } - handler.pushContext(tagName); - } - } - - /** - * A content operator implementation (EMC). 
- */ - private static class EndMarked implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "EMC"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - handler.popContext(); - } - } - - private class Do implements ContentOperator { - - /** - * @see org.openpdf.text.pdf.parser.ContentOperator#getOperatorName() - */ - @Override - public String getOperatorName() { - return "Do"; - } - - @Override - public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { - PdfObject firstOperand = operands.get(0); - if (firstOperand instanceof PdfName) { - PdfName name = (PdfName) firstOperand; - PdfDictionary dictionary = resources.getAsDict(PdfName.XOBJECT); - if (dictionary == null) { - return; - } - PdfStream stream = (PdfStream) dictionary.getDirectObject(name); - PdfName subType = stream.getAsName(PdfName.SUBTYPE); - if (PdfName.FORM.equals(subType)) { - PdfDictionary resources2 = stream.getAsDict(PdfName.RESOURCES); - if (resources2 == null) { - resources2 = resources; - } - - byte[] data; - try { - data = getContentBytesFromPdfObject(stream); - } catch (IOException ex) { - throw new ExceptionConverter(ex); - } - new PushGraphicsState().invoke(operands, handler, resources); - processContent(data, resources2); - new PopGraphicsState().invoke(operands, handler, resources); - } - } - - } - - private void processContent(byte[] contentBytes, PdfDictionary resources) { - try { - PdfContentParser pdfContentParser = new PdfContentParser(new PRTokeniser(contentBytes)); - List operands = new ArrayList<>(); - while (!pdfContentParser.parse(operands).isEmpty()) { - PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1); - invokeOperator(operator, operands, resources); - } - } catch (Exception e) { - throw new ExceptionConverter(e); - } - } - - - private byte[] 
getContentBytesFromPdfObject(PdfObject object) throws IOException { - switch (object.type()) { - case PdfObject.INDIRECT: - return getContentBytesFromPdfObject(PdfReader.getPdfObject(object)); - case PdfObject.STREAM: - return PdfReader.getStreamBytes((PRStream) PdfReader.getPdfObject(object)); - case PdfObject.ARRAY: - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - for (PdfObject element : ((PdfArray) object).getElements()) { - baos.write(getContentBytesFromPdfObject(element)); - } - return baos.toByteArray(); - default: - throw new IllegalStateException("Unsupported type: " + object.getClass().getCanonicalName()); - } - } - } + public abstract String getResultantText(); } diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextExtractor.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextExtractor.java new file mode 100644 index 000000000..871b1d29b --- /dev/null +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextExtractor.java @@ -0,0 +1,953 @@ +/* + Copyright 2014 by Tizra Inc. + The contents of this file are subject to the Mozilla Public License Version 1.1 + (the "License"); you may not use this file except in compliance with the License. + You may obtain a copy of the License at http://www.mozilla.org/MPL/ + + Software distributed under the License is distributed on an "AS IS" basis, + WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + for the specific language governing rights and limitations under the License. + + The Original Code is 'iText, a free JAVA-PDF library'. + + The Initial Developer of the Original Code is Bruno Lowagie. Portions created by + the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie. + All Rights Reserved. + Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer + are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved. 
+ + Contributor(s): all the names of the contributors are added in the source code + where applicable. + + Alternatively, the contents of this file may be used under the terms of the + LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the + provisions of LGPL are applicable instead of those above. If you wish to + allow use of your version of this file only under the terms of the LGPL + License and not to allow others to use your version of this file under + the MPL, indicate your decision by deleting the provisions above and + replace them with the notice and other provisions required by the LGPL. + If you do not delete the provisions above, a recipient may use your version + of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. + + This library is free software; you can redistribute it and/or modify it + under the terms of the MPL as stated above or under the terms of the GNU + Library General Public License as published by the Free Software Foundation; + either version 2 of the License, or any later version. + + This library is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more + details. 
+ */ +package org.openpdf.text.pdf.parser; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.Stack; +import org.openpdf.text.ExceptionConverter; +import org.openpdf.text.error_messages.MessageLocalization; +import org.openpdf.text.pdf.CMapAwareDocumentFont; +import org.openpdf.text.pdf.PRIndirectReference; +import org.openpdf.text.pdf.PRStream; +import org.openpdf.text.pdf.PRTokeniser; +import org.openpdf.text.pdf.PdfArray; +import org.openpdf.text.pdf.PdfContentParser; +import org.openpdf.text.pdf.PdfDictionary; +import org.openpdf.text.pdf.PdfIndirectReference; +import org.openpdf.text.pdf.PdfLiteral; +import org.openpdf.text.pdf.PdfName; +import org.openpdf.text.pdf.PdfNumber; +import org.openpdf.text.pdf.PdfObject; +import org.openpdf.text.pdf.PdfReader; +import org.openpdf.text.pdf.PdfStream; +import org.openpdf.text.pdf.PdfString; + +/** + * @author dgd + */ +@SuppressWarnings({"WeakerAccess", "unused"}) +public class PdfContentTextExtractor extends PdfContentStreamHandler { + + /** + * A map with all supported operators (PDF syntax).
+ */ + private Map operators; + + public PdfContentTextExtractor(TextAssembler renderListener) { + super(renderListener); + installDefaultOperators(); + reset(); + } + + private static Matrix getMatrix(List operands) { + float a = ((PdfNumber) operands.get(0)).floatValue(); + float b = ((PdfNumber) operands.get(1)).floatValue(); + float c = ((PdfNumber) operands.get(2)).floatValue(); + float d = ((PdfNumber) operands.get(3)).floatValue(); + float e = ((PdfNumber) operands.get(4)).floatValue(); + float f = ((PdfNumber) operands.get(5)).floatValue(); + return new Matrix(a, b, c, d, e, f); + } + + /** + * Registers a content operator that will be called when the specified operator string is encountered during content + * processing. Each operator may be registered only once (it is not legal to have multiple operators with the same + * operatorString) + * + * @param operator the operator that will receive notification when the operator is encountered + * @since 2.1.7 + */ + public void registerContentOperator(ContentOperator operator) { + String operatorString = operator.getOperatorName(); + if (operators.containsKey(operatorString)) { + throw new IllegalArgumentException(MessageLocalization.getComposedMessage( + "operator.1.already.registered", operatorString)); + } + operators.put(operatorString, operator); + } + + /** + * Loads all the supported graphics and text state operators in a map. 
+ */ + protected void installDefaultOperators() { + operators = new HashMap<>(); + + registerContentOperator(new PushGraphicsState()); + registerContentOperator(new PopGraphicsState()); + registerContentOperator(new ModifyCurrentTransformationMatrix()); + registerContentOperator(new ProcessGraphicsStateResource()); + + SetTextCharacterSpacing tcOperator = new SetTextCharacterSpacing(); + registerContentOperator(tcOperator); + SetTextWordSpacing twOperator = new SetTextWordSpacing(); + registerContentOperator(twOperator); + registerContentOperator(new SetTextHorizontalScaling()); + SetTextLeading tlOperator = new SetTextLeading(); + registerContentOperator(tlOperator); + registerContentOperator(new SetTextFont()); + registerContentOperator(new SetTextRenderMode()); + registerContentOperator(new SetTextRise()); + + registerContentOperator(new BeginText()); + registerContentOperator(new EndText()); + + TextMoveStartNextLine tdOperator = new TextMoveStartNextLine(); + registerContentOperator(tdOperator); + registerContentOperator(new TextMoveStartNextLineWithLeading(tdOperator, tlOperator)); + registerContentOperator(new TextSetTextMatrix()); + TextMoveNextLine tstarOperator = + new TextMoveNextLine(tdOperator); + registerContentOperator(tstarOperator); + + /* Register the same ShowText instance that the ' operator delegates to, as done for the other shared operators. */ + ShowText tjOperator = new ShowText(); + registerContentOperator(tjOperator); + MoveNextLineAndShowText tickOperator = + new MoveNextLineAndShowText(tstarOperator, tjOperator); + registerContentOperator(tickOperator); + registerContentOperator( + new MoveNextLineAndShowTextWithSpacing(twOperator, tcOperator, tickOperator)); + registerContentOperator(new ShowTextArray()); + // marked sections + registerContentOperator(new BeginMarked()); + registerContentOperator(new BeginMarkedDict()); + registerContentOperator(new EndMarked()); + + registerContentOperator(new Do()); + } + + /** + * Get the operator to process a command with a given name + * + * @param operatorName name of the operator that we might need to call
+ * @return the operator or null if none present + */ + public Optional lookupOperator(String operatorName) { + return Optional.ofNullable(operators.get(operatorName)); + } + + /** + * Invokes an operator. + * + * @param operator the PDF Syntax of the operator + * @param operands a list with operands + * @param resources Pdf Resources found in the file containing the stream. + */ + public void invokeOperator(PdfLiteral operator, List operands, PdfDictionary resources) { + String operatorName = operator.toString(); + lookupOperator(operatorName) + .ifPresent(contentOperator -> contentOperator.invoke(operands, this, resources)); + } + + void popContext() { + String contextName = contextNames.pop(); + List newBuffer = textFragmentStreams.pop(); + // put together set of unparsed text fragments + renderListener.reset(); + for (TextAssemblyBuffer fragment : textFragments) { + fragment.accumulate(renderListener, contextName); + } + + FinalText contextResult = renderListener.endParsingContext(contextName); + Optional.ofNullable(contextResult) + .map(FinalText::getText) + .filter(text -> !text.isEmpty()) + .ifPresent(text -> newBuffer.add(contextResult)); + + textFragments = newBuffer; + } + + void pushContext(String newContextName) { + contextNames.push(newContextName); + textFragmentStreams.push(textFragments); + textFragments = new ArrayList<>(); + } + + public void reset() { + if (gsStack == null || gsStack.isEmpty()) { + gsStack = new Stack<>(); + } + gsStack.add(new GraphicsState()); + textMatrix = null; + textLineMatrix = null; + } + + /** + * Displays text. 
+ * + * @param string the text to display + */ + void displayPdfString(PdfString string) { + ParsedText renderInfo = ParsedText.create(string, graphicsState(), textMatrix); + if (contextNames.peek() != null) { + textFragments.add(renderInfo); + } + textMatrix = new Matrix(renderInfo.getWidth(), 0) + .multiply(textMatrix); + } + + /** + * @return result text + */ + public String getResultantText() { + if (contextNames.size() > 0) { + throw new RuntimeException("can't get text with unprocessed stack items"); + } + StringBuilder res = new StringBuilder(); + for (TextAssemblyBuffer fragment : textFragments) { + res.append(fragment.getText()); + } + return res.toString().trim(); + } + + /** + * A content operator implementation (TJ). + */ + static class ShowTextArray implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "TJ"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfArray array = (PdfArray) operands.get(0); + for (PdfObject entryObj : array.getElements()) { + if (entryObj instanceof PdfString) { + handler.displayPdfString((PdfString) entryObj); + } else { + float tj = ((PdfNumber) entryObj).floatValue(); + handler.applyTextAdjust(tj); + } + } + + } + } + + /** + * A content operator implementation (BT). + */ + static class BeginText implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "BT"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.textMatrix = new Matrix(); + handler.textLineMatrix = handler.textMatrix; + } + } + + /** + * A content operator implementation (ET). 
+ */ + static class EndText implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "ET"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.textMatrix = null; + handler.textLineMatrix = null; + } + } + + /** + * A content operator implementation (cm). + */ + static class ModifyCurrentTransformationMatrix implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "cm"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + Matrix matrix = getMatrix(operands); + GraphicsState graphicsState = handler.gsStack.peek(); + graphicsState.multiplyCtm(matrix); + } + } + + /** + * A content operator implementation ('). + */ + static class MoveNextLineAndShowText implements ContentOperator { + + private final TextMoveNextLine textMoveNextLine; + private final ShowText showText; + + public MoveNextLineAndShowText( + TextMoveNextLine textMoveNextLine, + ShowText showText) { + this.textMoveNextLine = textMoveNextLine; + this.showText = showText; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "'"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + textMoveNextLine.invoke(new ArrayList<>(0), handler, resources); + showText.invoke(operands, handler, resources); + } + } + + /** + * A content operator implementation ("). 
+ */ + static class MoveNextLineAndShowTextWithSpacing implements ContentOperator { + + private final SetTextWordSpacing setTextWordSpacing; + private final SetTextCharacterSpacing setTextCharacterSpacing; + private final MoveNextLineAndShowText moveNextLineAndShowText; + + public MoveNextLineAndShowTextWithSpacing( + SetTextWordSpacing setTextWordSpacing, + SetTextCharacterSpacing setTextCharacterSpacing, + MoveNextLineAndShowText moveNextLineAndShowText) { + this.setTextWordSpacing = setTextWordSpacing; + this.setTextCharacterSpacing = setTextCharacterSpacing; + this.moveNextLineAndShowText = moveNextLineAndShowText; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "\""; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber aw = (PdfNumber) operands.get(0); + PdfNumber ac = (PdfNumber) operands.get(1); + PdfString string = (PdfString) operands.get(2); + + List twOperands = new ArrayList<>(1); + twOperands.add(0, aw); + setTextWordSpacing.invoke(twOperands, handler, resources); + + List tcOperands = new ArrayList<>(1); + tcOperands.add(0, ac); + setTextCharacterSpacing.invoke(tcOperands, handler, resources); + + List tickOperands = new ArrayList<>(1); + tickOperands.add(0, string); + moveNextLineAndShowText.invoke(tickOperands, handler, resources); + } + } + + /** + * A content operator implementation (Q). + */ + static class PopGraphicsState implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Q"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.gsStack.pop(); + } + } + + /** + * A content operator implementation (gs). 
+ */ + static class ProcessGraphicsStateResource implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "gs"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfName dictionaryName = (PdfName) operands.get(0); + PdfDictionary extGState = resources.getAsDict(PdfName.EXTGSTATE); + if (extGState == null) { + throw new IllegalArgumentException( + MessageLocalization.getComposedMessage( + "resources.do.not.contain.extgstate.entry.unable.to.process.operator.1", + getOperatorName())); + } + PdfDictionary gsDic = extGState.getAsDict(dictionaryName); + if (gsDic == null) { + throw new IllegalArgumentException(MessageLocalization.getComposedMessage( + "1.is.an.unknown.graphics.state.dictionary", dictionaryName)); + } + + // at this point, all we care about is the FONT entry in the GS + // dictionary + PdfArray fontParameter = gsDic.getAsArray(PdfName.FONT); + if (fontParameter != null) { + PdfObject pdfObject = fontParameter.getPdfObject(0); + CMapAwareDocumentFont font = new CMapAwareDocumentFont((PRIndirectReference) pdfObject); + float size = fontParameter.getAsNumber(1).floatValue(); + + handler.graphicsState().setFont(font); + handler.graphicsState().setFontSize(size); + } + } + } + + /** + * A content operator implementation (q). + */ + static class PushGraphicsState implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "q"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + GraphicsState gs = handler.gsStack.peek(); + GraphicsState copy = new GraphicsState(gs); + handler.gsStack.push(copy); + } + } + + /** + * A content operator implementation (Tc). 
+ */ + static class SetTextCharacterSpacing implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tc"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber charSpace = (PdfNumber) operands.get(0); + handler.graphicsState().setCharacterSpacing(charSpace.floatValue()); + } + } + + /** + * A content operator implementation (Tf). + */ + static class SetTextFont implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tf"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfName fontResourceName = (PdfName) operands.get(0); + float size = ((PdfNumber) operands.get(1)).floatValue(); + + PdfDictionary fontsDictionary = resources.getAsDict(PdfName.FONT); + PdfObject pdfObject = fontsDictionary.get(fontResourceName); + CMapAwareDocumentFont font = new CMapAwareDocumentFont((PRIndirectReference) pdfObject); + + handler.graphicsState().setFont(font); + handler.graphicsState().setFontSize(size); + } + } + + /** + * A content operator implementation (Tm). + */ + static class TextSetTextMatrix implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tm"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.textLineMatrix = getMatrix(operands); + handler.textMatrix = handler.textLineMatrix; + } + } + + /** + * A content operator implementation (TD). 
+ */ + static class TextMoveStartNextLineWithLeading implements ContentOperator { + + private final TextMoveStartNextLine moveStartNextLine; + + private final SetTextLeading setTextLeading; + + public TextMoveStartNextLineWithLeading( + TextMoveStartNextLine moveStartNextLine, + SetTextLeading setTextLeading) { + this.moveStartNextLine = moveStartNextLine; + this.setTextLeading = setTextLeading; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "TD"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + float ty = ((PdfNumber) operands.get(1)).floatValue(); + + List tlOperands = new ArrayList<>(1); + tlOperands.add(0, new PdfNumber(-ty)); + setTextLeading.invoke(tlOperands, handler, resources); + moveStartNextLine.invoke(operands, handler, resources); + } + } + + /** + * A content operator implementation (Tj). + */ + static class ShowText implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tj"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfString string = (PdfString) operands.get(0); + handler.displayPdfString(string); + } + } + + /** + * A content operator implementation (T*). 
+ */ + static class TextMoveNextLine implements ContentOperator { + + private final TextMoveStartNextLine moveStartNextLine; + + public TextMoveNextLine(TextMoveStartNextLine moveStartNextLine) { + this.moveStartNextLine = moveStartNextLine; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "T*"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + List tdoperands = new ArrayList<>(2); + tdoperands.add(0, new PdfNumber(0)); + tdoperands.add(1, new PdfNumber(-handler.graphicsState().getLeading())); + moveStartNextLine.invoke(tdoperands, handler, resources); + } + } + + /** + * A content operator implementation (Td). + */ + static class TextMoveStartNextLine implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Td"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + float tx = ((PdfNumber) operands.get(0)).floatValue(); + float ty = ((PdfNumber) operands.get(1)).floatValue(); + + Matrix translationMatrix = new Matrix(tx, ty); + handler.textMatrix = translationMatrix.multiply(handler.textLineMatrix); + handler.textLineMatrix = handler.textMatrix; + } + } + + /** + * A content operator implementation (Tr). + */ + static class SetTextRenderMode implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tr"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber render = (PdfNumber) operands.get(0); + handler.graphicsState().setRenderMode(render.intValue()); + } + } + + /** + * A content operator implementation (Ts). 
+ */ + static class SetTextRise implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Ts"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber rise = (PdfNumber) operands.get(0); + handler.graphicsState().setRise(rise.floatValue()); + } + } + + /** + * A content operator implementation (TL). + */ + static class SetTextLeading implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "TL"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber leading = (PdfNumber) operands.get(0); + handler.graphicsState().setLeading(leading.floatValue()); + } + } + + /** + * A content operator implementation (Tz). + */ + static class SetTextHorizontalScaling implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tz"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber scale = (PdfNumber) operands.get(0); + handler.graphicsState().setHorizontalScaling(scale.floatValue()); + } + } + + /** + * A content operator implementation (Tw). + */ + static class SetTextWordSpacing implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tw"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber wordSpace = (PdfNumber) operands.get(0); + handler.graphicsState().setWordSpacing(wordSpace.floatValue()); + } + } + + /** + * A content operator implementation (BMC). 
+ */ + private static class BeginMarked implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "BMC"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfName tagName = (PdfName) operands.get(0); + String realName = tagName.toString().substring(1).toLowerCase(Locale.ROOT); + if ("artifact".equals(realName) || "placedpdf".equals(realName)) { + handler.pushContext(null); + } else { + handler.pushContext(realName); + } + } + + } + + /** + * A content operator implementation (BDC). + */ + private static class BeginMarkedDict implements ContentOperator { + + /** + * The BDC marked-content operator which brackets a marked-content sequence of objects within the content + * stream. + * + * @param operands list of operands + * @param resources dictionary + * @return PdfDictionary of type BDC marked-content + */ + private static PdfDictionary getBDCDictionary(List operands, PdfDictionary resources) { + PdfObject pdfObject = operands.get(1); + if (pdfObject.isName()) { + PdfDictionary properties = resources.getAsDict(PdfName.PROPERTIES); + PdfIndirectReference ir = properties.getAsIndirectObject((PdfName) pdfObject); + if (ir != null) { + pdfObject = ir.getIndRef(); + } else { + pdfObject = properties.getAsDict((PdfName) pdfObject); + } + } + return (PdfDictionary) pdfObject; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "BDC"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfObject firstOperand = operands.get(0); + String tagName = firstOperand.toString().substring(1).toLowerCase(Locale.ROOT); + if ("artifact".equals(tagName) || "placedpdf".equals(tagName) + || handler.contextNames.peek() == null) { + tagName = null; + } else if ("l".equals(tagName)) { + tagName = "ul"; + } + 
PdfDictionary attrs = getBDCDictionary(operands, resources); + if (attrs != null && tagName != null) { + PdfString alternateText = attrs.getAsString(PdfName.E); + if (alternateText != null) { + handler.pushContext(tagName); + handler.textFragments + .add(new FinalText(alternateText.toString())); + handler.popContext(); + // ignore rest of the content of this element + handler.pushContext(null); + return; + } else if (attrs.get(PdfName.TYPE) != null) { + // ignore tag for non-tag marked content that sometimes + // shows up. + tagName = ""; + } + } + handler.pushContext(tagName); + } + } + + /** + * A content operator implementation (EMC). + */ + private static class EndMarked implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "EMC"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.popContext(); + } + } + + private class Do implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Do"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfObject firstOperand = operands.get(0); + if (firstOperand instanceof PdfName) { + PdfName name = (PdfName) firstOperand; + PdfDictionary dictionary = resources.getAsDict(PdfName.XOBJECT); + if (dictionary == null) { + return; + } + PdfStream stream = (PdfStream) dictionary.getDirectObject(name); + PdfName subType = stream.getAsName(PdfName.SUBTYPE); + if (PdfName.FORM.equals(subType)) { + PdfDictionary resources2 = stream.getAsDict(PdfName.RESOURCES); + if (resources2 == null) { + resources2 = resources; + } + + byte[] data; + try { + data = getContentBytesFromPdfObject(stream); + } catch (IOException ex) { + throw new ExceptionConverter(ex); + } + new PushGraphicsState().invoke(operands, handler, resources); + 
processContent(data, resources2); + new PopGraphicsState().invoke(operands, handler, resources); + } + } + + } + + private void processContent(byte[] contentBytes, PdfDictionary resources) { + try { + PdfContentParser pdfContentParser = new PdfContentParser(new PRTokeniser(contentBytes)); + List operands = new ArrayList<>(); + while (!pdfContentParser.parse(operands).isEmpty()) { + PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1); + invokeOperator(operator, operands, resources); + } + } catch (Exception e) { + throw new ExceptionConverter(e); + } + } + + + private byte[] getContentBytesFromPdfObject(PdfObject object) throws IOException { + switch (object.type()) { + case PdfObject.INDIRECT: + return getContentBytesFromPdfObject(PdfReader.getPdfObject(object)); + case PdfObject.STREAM: + return PdfReader.getStreamBytes((PRStream) PdfReader.getPdfObject(object)); + case PdfObject.ARRAY: + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (PdfObject element : ((PdfArray) object).getElements()) { + baos.write(getContentBytesFromPdfObject(element)); + } + return baos.toByteArray(); + default: + throw new IllegalStateException("Unsupported type: " + object.getClass().getCanonicalName()); + } + } + } +} diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java new file mode 100644 index 000000000..132c16f5e --- /dev/null +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java @@ -0,0 +1,966 @@ +/* + Copyright 2014 by Tizra Inc. + The contents of this file are subject to the Mozilla Public License Version 1.1 + (the "License"); you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at http://www.mozilla.org/MPL/ + + Software distributed under the License is distributed on an "AS IS" basis, + WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + for the specific language governing rights and limitations under the License. + + The Original Code is 'iText, a free JAVA-PDF library'. + + The Initial Developer of the Original Code is Bruno Lowagie. Portions created by + the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie. + All Rights Reserved. + Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer + are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved. + + Contributor(s): all the names of the contributors are added in the source code + where applicable. + + Alternatively, the contents of this file may be used under the terms of the + LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the + provisions of LGPL are applicable instead of those above. If you wish to + allow use of your version of this file only under the terms of the LGPL + License and not to allow others to use your version of this file under + the MPL, indicate your decision by deleting the provisions above and + replace them with the notice and other provisions required by the LGPL. + If you do not delete the provisions above, a recipient may use your version + of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. + + This library is free software; you can redistribute it and/or modify it + under the terms of the MPL as stated above or under the terms of the GNU + Library General Public License as published by the Free Software Foundation; + either version 2 of the License, or any later version. + + This library is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. 
See the GNU Library general Public License for more + details. + */ +package org.openpdf.text.pdf.parser; + +import org.openpdf.text.ExceptionConverter; +import org.openpdf.text.error_messages.MessageLocalization; +import org.openpdf.text.pdf.*; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author dgd + */ +@SuppressWarnings({"WeakerAccess", "unused"}) +public class PdfContentTextLocator extends PdfContentStreamHandler { + + private final ArrayList accumulator = new ArrayList<>(); + + private final ArrayList fragments = new ArrayList<>(); + private final ArrayList fragmentsWidths = new ArrayList<>(); + + /** + * A map with all supported operators operators (PDF syntax). + */ + private Map operators; + + private final int page; + private final Pattern p; + + + public PdfContentTextLocator(TextAssembler renderListener, String pattern, int page) { + super(renderListener); + if(pattern == null) throw new IllegalArgumentException("Pattern cannot be null"); + //We check for length because we want to include whitespaces as possible patterns + if(pattern.isEmpty()) throw new IllegalArgumentException("Pattern sequence must be longer than 0"); + this.p = Pattern.compile(pattern); + this.page = page; + installDefaultOperators(); + reset(); + } + + private static Matrix getMatrix(List operands) { + float a = ((PdfNumber) operands.get(0)).floatValue(); + float b = ((PdfNumber) operands.get(1)).floatValue(); + float c = ((PdfNumber) operands.get(2)).floatValue(); + float d = ((PdfNumber) operands.get(3)).floatValue(); + float e = ((PdfNumber) operands.get(4)).floatValue(); + float f = ((PdfNumber) operands.get(5)).floatValue(); + return new Matrix(a, b, c, d, e, f); + } + + /** + * Registers a content operator that will be called when the specified operator string is encountered during content + * processing. 
Each operator may be registered only once (it is not legal to have multiple operators with the same + * operatorString) + * + * @param operator the operator that will receive notification when the operator is encountered + * @since 2.1.7 + */ + public void registerContentOperator(ContentOperator operator) { + String operatorString = operator.getOperatorName(); + if (operators.containsKey(operatorString)) { + throw new IllegalArgumentException(MessageLocalization.getComposedMessage( + "operator.1.already.registered", operatorString)); + } + operators.put(operatorString, operator); + } + + /** + * Loads all the supported graphics and text state operators in a map. + */ + protected void installDefaultOperators() { + operators = new HashMap<>(); + registerContentOperator(new PushGraphicsState()); + registerContentOperator(new PopGraphicsState()); + registerContentOperator(new ModifyCurrentTransformationMatrix()); + registerContentOperator(new ProcessGraphicsStateResource()); + + SetTextCharacterSpacing tcOperator = new SetTextCharacterSpacing(); + registerContentOperator(tcOperator); + SetTextWordSpacing twOperator = new SetTextWordSpacing(); + registerContentOperator(twOperator); + registerContentOperator(new SetTextHorizontalScaling()); + SetTextLeading tlOperator = new SetTextLeading(); + registerContentOperator(tlOperator); + registerContentOperator(new SetTextFont()); + registerContentOperator(new SetTextRenderMode()); + registerContentOperator(new SetTextRise()); + + registerContentOperator(new BeginText()); + registerContentOperator(new EndText()); + + TextMoveStartNextLine tdOperator = new TextMoveStartNextLine(); + registerContentOperator(tdOperator); + registerContentOperator(new TextMoveStartNextLineWithLeading(tdOperator, tlOperator)); + registerContentOperator(new TextSetTextMatrix()); + TextMoveNextLine tstarOperator = + new TextMoveNextLine(tdOperator); + registerContentOperator(tstarOperator); + + ShowText tjOperator = new ShowText(); + 
registerContentOperator(new ShowText()); + MoveNextLineAndShowText tickOperator = + new MoveNextLineAndShowText(tstarOperator, tjOperator); + registerContentOperator(tickOperator); + registerContentOperator( + new MoveNextLineAndShowTextWithSpacing(twOperator, tcOperator, tickOperator)); + registerContentOperator(new ShowTextArray()); + // marked sections + registerContentOperator(new BeginMarked()); + registerContentOperator(new BeginMarkedDict()); + registerContentOperator(new EndMarked()); + + registerContentOperator(new Do()); + } + + /** + * Get the operator to process a command with a given name + * + * @param operatorName name of the operator that we might need to call + * @return the operator or null if none present + */ + public Optional lookupOperator(String operatorName) { + return Optional.ofNullable(operators.get(operatorName)); + } + + /** + * Invokes an operator. + * + * @param operator the PDF Syntax of the operator + * @param operands a list with operands + * @param resources Pdf Resources found in the file containing the stream. 
+ */ + public void invokeOperator(PdfLiteral operator, List operands, PdfDictionary resources) { + String operatorName = operator.toString(); + lookupOperator(operatorName) + .ifPresent(contentOperator -> contentOperator.invoke(operands, this, resources)); + } + + void popContext() { + renderListener.reset(); + } + + void pushContext(String newContextName) { + } + + public void reset() { + if (gsStack == null || gsStack.isEmpty()) { + gsStack = new Stack<>(); + } + gsStack.add(new GraphicsState()); + textMatrix = null; + textLineMatrix = null; + } + + /** + * Search for a pattern in a PdfString + * and if found, collect its bounding box + * + * @param string the text to inspect + */ + void displayPdfString(PdfString string) { + + String decoded; + byte[] bytes; + if (BaseFont.IDENTITY_H.equals(graphicsState().getFont().getEncoding())) { + bytes = string.toString().getBytes(StandardCharsets.UTF_16); + } else { + bytes = string.toString().getBytes(); + } + decoded = graphicsState().getFont().decode(bytes, 0, bytes.length); + char[] chars = decoded.toCharArray(); + final float[] widths = new float[chars.length + 1]; + Vector startPoint = new Vector(0, 0, 1f).cross(textMatrix); + float startWidth = startPoint.get(0); + float totalWidth = 0; + widths[0] = startWidth; + int counter = 1; + for (char c : chars) { + float w = graphicsState().getFont().getWidth(c) / 1000.0f; + float wordSpacing = Character.isSpaceChar(c) ? 
graphicsState().getWordSpacing() : 0f; + float blockWidth = (w * graphicsState().getFontSize() + graphicsState().getCharacterSpacing() + wordSpacing) + * graphicsState().getHorizontalScaling(); + totalWidth += blockWidth; + widths[counter] = startWidth + totalWidth; + counter++; + } + + float pdfStringWidth = startWidth + totalWidth; + float y = new Vector(0, 0, 1f).cross(textMatrix).get(1); + float y1 = y + graphicsState().getFontDescentDescriptor(); + float y2 = y + graphicsState().getFontAscentDescriptor(); + + Matcher m = p.matcher(decoded); + while (m.find()) { + float x1 = widths[m.start()]; + float x2 = widths[m.end()]; + MatchedPattern mp = new MatchedPattern(decoded, this.page, x1, y1, x2, y2); + accumulator.add(mp); + } + + textMatrix = new Matrix(totalWidth, 0).multiply(textMatrix); + } + + private float convertHeightToUser(float height) { + Vector endPos = new Vector(0, height, 1f).cross(textMatrix); + return endPos.get(1); + } + + @Override + public String getResultantText() { + return ""; + } + + /** + * @return list of text strips that matches + */ + public ArrayList getMatchedPatterns() { + return this.accumulator; + } + + /** + * A content operator implementation (TJ). + */ + static class ShowTextArray implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "TJ"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfArray array = (PdfArray) operands.get(0); + for (PdfObject entryObj : array.getElements()) { + if (entryObj instanceof PdfString) { + handler.displayPdfString((PdfString) entryObj); + } else { + float tj = ((PdfNumber) entryObj).floatValue(); + handler.applyTextAdjust(tj); + } + } + + } + } + + /** + * A content operator implementation (BT). 
+ */ + static class BeginText implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "BT"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.textMatrix = new Matrix(); + handler.textLineMatrix = handler.textMatrix; + } + } + + /** + * A content operator implementation (ET). + */ + static class EndText implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "ET"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.textMatrix = null; + handler.textLineMatrix = null; + } + } + + /** + * A content operator implementation (cm). + */ + static class ModifyCurrentTransformationMatrix implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "cm"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + Matrix matrix = getMatrix(operands); + GraphicsState graphicsState = handler.gsStack.peek(); + graphicsState.multiplyCtm(matrix); + } + } + + /** + * A content operator implementation ('). 
+ */ + static class MoveNextLineAndShowText implements ContentOperator { + + private final TextMoveNextLine textMoveNextLine; + private final ShowText showText; + + public MoveNextLineAndShowText( + TextMoveNextLine textMoveNextLine, + ShowText showText) { + this.textMoveNextLine = textMoveNextLine; + this.showText = showText; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "'"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + textMoveNextLine.invoke(new ArrayList<>(0), handler, resources); + showText.invoke(operands, handler, resources); + } + } + + /** + * A content operator implementation ("). + */ + static class MoveNextLineAndShowTextWithSpacing implements ContentOperator { + + private final SetTextWordSpacing setTextWordSpacing; + private final SetTextCharacterSpacing setTextCharacterSpacing; + private final MoveNextLineAndShowText moveNextLineAndShowText; + + public MoveNextLineAndShowTextWithSpacing( + SetTextWordSpacing setTextWordSpacing, + SetTextCharacterSpacing setTextCharacterSpacing, + MoveNextLineAndShowText moveNextLineAndShowText) { + this.setTextWordSpacing = setTextWordSpacing; + this.setTextCharacterSpacing = setTextCharacterSpacing; + this.moveNextLineAndShowText = moveNextLineAndShowText; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "\""; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber aw = (PdfNumber) operands.get(0); + PdfNumber ac = (PdfNumber) operands.get(1); + PdfString string = (PdfString) operands.get(2); + + List twOperands = new ArrayList<>(1); + twOperands.addFirst(aw); + setTextWordSpacing.invoke(twOperands, handler, resources); + + List tcOperands = new ArrayList<>(1); + tcOperands.addFirst(ac); + setTextCharacterSpacing.invoke(tcOperands, handler, 
resources); + + List tickOperands = new ArrayList<>(1); + tickOperands.addFirst(string); + moveNextLineAndShowText.invoke(tickOperands, handler, resources); + } + } + + /** + * A content operator implementation (Q). + */ + static class PopGraphicsState implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Q"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.gsStack.pop(); + } + } + + /** + * A content operator implementation (gs). + */ + static class ProcessGraphicsStateResource implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "gs"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfName dictionaryName = (PdfName) operands.getFirst(); + PdfDictionary extGState = resources.getAsDict(PdfName.EXTGSTATE); + if (extGState == null) { + throw new IllegalArgumentException( + MessageLocalization.getComposedMessage( + "resources.do.not.contain.extgstate.entry.unable.to.process.operator.1", + getOperatorName())); + } + PdfDictionary gsDic = extGState.getAsDict(dictionaryName); + if (gsDic == null) { + throw new IllegalArgumentException(MessageLocalization.getComposedMessage( + "1.is.an.unknown.graphics.state.dictionary", dictionaryName)); + } + + // at this point, all we care about is the FONT entry in the GS + // dictionary + PdfArray fontParameter = gsDic.getAsArray(PdfName.FONT); + if (fontParameter != null) { + PdfObject pdfObject = fontParameter.getPdfObject(0); + CMapAwareDocumentFont font = new CMapAwareDocumentFont((PRIndirectReference) pdfObject); + float size = fontParameter.getAsNumber(1).floatValue(); + handler.graphicsState().setFont(font); + handler.graphicsState().setFontSize(size); + } + } + } + + /** + * A content operator implementation 
(q). + */ + static class PushGraphicsState implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "q"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + GraphicsState gs = handler.gsStack.peek(); + GraphicsState copy = new GraphicsState(gs); + handler.gsStack.push(copy); + } + } + + /** + * A content operator implementation (Tc). + */ + static class SetTextCharacterSpacing implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tc"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber charSpace = (PdfNumber) operands.getFirst(); + handler.graphicsState().setCharacterSpacing(charSpace.floatValue()); + } + } + + /** + * A content operator implementation (Tf). + */ + static class SetTextFont implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tf"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfName fontResourceName = (PdfName) operands.get(0); + float size = ((PdfNumber) operands.get(1)).floatValue(); + + PdfDictionary fontsDictionary = resources.getAsDict(PdfName.FONT); + PdfObject pdfObject = fontsDictionary.get(fontResourceName); + CMapAwareDocumentFont font = new CMapAwareDocumentFont((PRIndirectReference) pdfObject); + + handler.graphicsState().setFont(font); + handler.graphicsState().setFontSize(size); + } + } + + /** + * A content operator implementation (Tm). 
+ */ + static class TextSetTextMatrix implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tm"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.textLineMatrix = getMatrix(operands); + handler.textMatrix = handler.textLineMatrix; + } + } + + /** + * A content operator implementation (TD). + */ + static class TextMoveStartNextLineWithLeading implements ContentOperator { + + private final TextMoveStartNextLine moveStartNextLine; + + private final SetTextLeading setTextLeading; + + public TextMoveStartNextLineWithLeading( + TextMoveStartNextLine moveStartNextLine, + SetTextLeading setTextLeading) { + this.moveStartNextLine = moveStartNextLine; + this.setTextLeading = setTextLeading; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "TD"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + float ty = ((PdfNumber) operands.get(1)).floatValue(); + + List tlOperands = new ArrayList<>(1); + tlOperands.addFirst(new PdfNumber(-ty)); + setTextLeading.invoke(tlOperands, handler, resources); + moveStartNextLine.invoke(operands, handler, resources); + } + } + + /** + * A content operator implementation (Tj). + */ + static class ShowText implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tj"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfString string = (PdfString) operands.getFirst(); + handler.displayPdfString(string); + } + } + + /** + * A content operator implementation (T*). 
+ */ + static class TextMoveNextLine implements ContentOperator { + + private final TextMoveStartNextLine moveStartNextLine; + + public TextMoveNextLine(TextMoveStartNextLine moveStartNextLine) { + this.moveStartNextLine = moveStartNextLine; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "T*"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + List tdoperands = new ArrayList<>(2); + tdoperands.add(0, new PdfNumber(0)); + tdoperands.add(1, new PdfNumber(-handler.graphicsState().getLeading())); + moveStartNextLine.invoke(tdoperands, handler, resources); + } + } + + /** + * A content operator implementation (Td). + */ + static class TextMoveStartNextLine implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Td"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + float tx = ((PdfNumber) operands.get(0)).floatValue(); + float ty = ((PdfNumber) operands.get(1)).floatValue(); + + Matrix translationMatrix = new Matrix(tx, ty); + handler.textMatrix = translationMatrix.multiply(handler.textLineMatrix); + handler.textLineMatrix = handler.textMatrix; + } + } + + /** + * A content operator implementation (Tr). + */ + static class SetTextRenderMode implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tr"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber render = (PdfNumber) operands.getFirst(); + handler.graphicsState().setRenderMode(render.intValue()); + } + } + + /** + * A content operator implementation (Ts). 
+ */ + static class SetTextRise implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Ts"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber rise = (PdfNumber) operands.getFirst(); + handler.graphicsState().setRise(rise.floatValue()); + } + } + + /** + * A content operator implementation (TL). + */ + static class SetTextLeading implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "TL"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber leading = (PdfNumber) operands.getFirst(); + handler.graphicsState().setLeading(leading.floatValue()); + } + } + + /** + * A content operator implementation (Tz). + */ + static class SetTextHorizontalScaling implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tz"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber scale = (PdfNumber) operands.getFirst(); + handler.graphicsState().setHorizontalScaling(scale.floatValue()); + } + } + + /** + * A content operator implementation (Tw). + */ + static class SetTextWordSpacing implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Tw"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfNumber wordSpace = (PdfNumber) operands.getFirst(); + handler.graphicsState().setWordSpacing(wordSpace.floatValue()); + } + } + + /** + * A content operator implementation (BMC). 
+ */ + private static class BeginMarked implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "BMC"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfName tagName = (PdfName) operands.getFirst(); + String realName = tagName.toString().substring(1).toLowerCase(Locale.ROOT); + if ("artifact".equals(realName) || "placedpdf".equals(realName)) { + handler.pushContext(null); + } else { + handler.pushContext(realName); + } + } + + } + + /** + * A content operator implementation (BDC). + */ + private static class BeginMarkedDict implements ContentOperator { + + /** + * The BDC marked-content operator which brackets a marked-content sequence of objects within the content + * stream. + * + * @param operands list of operands + * @param resources dictionary + * @return PdfDictionary of type BDC marked-content + */ + private static PdfDictionary getBDCDictionary(List operands, PdfDictionary resources) { + PdfObject pdfObject = operands.get(1); + if (pdfObject.isName()) { + PdfDictionary properties = resources.getAsDict(PdfName.PROPERTIES); + PdfIndirectReference ir = properties.getAsIndirectObject((PdfName) pdfObject); + if (ir != null) { + pdfObject = ir.getIndRef(); + } else { + pdfObject = properties.getAsDict((PdfName) pdfObject); + } + } + return (PdfDictionary) pdfObject; + } + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "BDC"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfObject firstOperand = operands.getFirst(); + String tagName = firstOperand.toString().substring(1).toLowerCase(Locale.ROOT); + if ("artifact".equals(tagName) || "placedpdf".equals(tagName)) { + tagName = null; + } else if ("l".equals(tagName)) { + tagName = "ul"; + } + PdfDictionary attrs = 
getBDCDictionary(operands, resources); + if (attrs != null && tagName != null) { + PdfString alternateText = attrs.getAsString(PdfName.E); + if (alternateText != null) { + handler.pushContext(tagName); + handler.popContext(); + // ignore rest of the content of this element + handler.pushContext(null); + return; + } else if (attrs.get(PdfName.TYPE) != null) { + // ignore tag for non-tag marked content that sometimes + // shows up. + tagName = ""; + } + } + handler.pushContext(tagName); + } + } + + /** + * A content operator implementation (EMC). + */ + private static class EndMarked implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "EMC"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + handler.popContext(); + } + } + + private class Do implements ContentOperator { + + /** + * @see ContentOperator#getOperatorName() + */ + @Override + public String getOperatorName() { + return "Do"; + } + + @Override + public void invoke(List operands, PdfContentStreamHandler handler, PdfDictionary resources) { + PdfObject firstOperand = operands.getFirst(); + if (firstOperand instanceof PdfName) { + PdfName name = (PdfName) firstOperand; + PdfDictionary dictionary = resources.getAsDict(PdfName.XOBJECT); + if (dictionary == null) { + return; + } + PdfStream stream = (PdfStream) dictionary.getDirectObject(name); + PdfName subType = stream.getAsName(PdfName.SUBTYPE); + if (PdfName.FORM.equals(subType)) { + PdfDictionary resources2 = stream.getAsDict(PdfName.RESOURCES); + if (resources2 == null) { + resources2 = resources; + } + + byte[] data; + try { + data = getContentBytesFromPdfObject(stream); + } catch (IOException ex) { + throw new ExceptionConverter(ex); + } + new PushGraphicsState().invoke(operands, handler, resources); + processContent(data, resources2); + new PopGraphicsState().invoke(operands, handler, resources); + } + 
} + + } + + private void processContent(byte[] contentBytes, PdfDictionary resources) { + try { + PdfContentParser pdfContentParser = new PdfContentParser(new PRTokeniser(contentBytes)); + List operands = new ArrayList<>(); + while (!pdfContentParser.parse(operands).isEmpty()) { + PdfLiteral operator = (PdfLiteral) operands.getLast(); + invokeOperator(operator, operands, resources); + } + } catch (Exception e) { + throw new ExceptionConverter(e); + } + } + + + private byte[] getContentBytesFromPdfObject(PdfObject object) throws IOException { + switch (object.type()) { + case PdfObject.INDIRECT: + return getContentBytesFromPdfObject(PdfReader.getPdfObject(object)); + case PdfObject.STREAM: + return PdfReader.getStreamBytes((PRStream) PdfReader.getPdfObject(object)); + case PdfObject.ARRAY: + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (PdfObject element : ((PdfArray) object).getElements()) { + baos.write(getContentBytesFromPdfObject(element)); + } + return baos.toByteArray(); + default: + throw new IllegalStateException("Unsupported type: " + object.getClass().getCanonicalName()); + } + } + } +} diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextExtractor.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextExtractor.java index 84f4028db..70c2dc2b3 100644 --- a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextExtractor.java +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextExtractor.java @@ -195,7 +195,7 @@ public String getTextFromPage(int page, boolean useContainerMarkup) throws IOExc renderListener.reset(); renderListener.setPage(page); - PdfContentStreamHandler handler = new PdfContentStreamHandler(renderListener); + PdfContentStreamHandler handler = new PdfContentTextExtractor(renderListener); processContent(getContentBytesForPage(page), resources, handler); return handler.getResultantText(); } diff --git 
a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java new file mode 100644 index 000000000..486ff29c5 --- /dev/null +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java @@ -0,0 +1,227 @@ +/* + * Copyright 2008 by Kevin Day. + * + * Contributions copyright 2014 Tizra Inc. + * + * The contents of this file are subject to the Mozilla Public License Version 1.1 + * (the "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the License. + * + * The Original Code is 'iText, a free JAVA-PDF library'. + * + * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by + * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie. + * All Rights Reserved. + * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer + * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved. + * + * Contributor(s): all the names of the contributors are added in the source code + * where applicable. + * + * Alternatively, the contents of this file may be used under the terms of the + * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the + * provisions of LGPL are applicable instead of those above. If you wish to + * allow use of your version of this file only under the terms of the LGPL + * License and not to allow others to use your version of this file under + * the MPL, indicate your decision by deleting the provisions above and + * replace them with the notice and other provisions required by the LGPL. 
+ * If you do not delete the provisions above, a recipient may use your version + * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the MPL as stated above or under the terms of the GNU + * Library General Public License as published by the Free Software Foundation; + * either version 2 of the License, or any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more + * details. + * + * If you didn't download this code from the following link, you should check if + * you aren't using an obsolete version: + * https://github.com/LibrePDF/OpenPDF + */ +package org.openpdf.text.pdf.parser; + + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.openpdf.text.ExceptionConverter; +import org.openpdf.text.pdf.PRIndirectReference; +import org.openpdf.text.pdf.PRStream; +import org.openpdf.text.pdf.PRTokeniser; +import org.openpdf.text.pdf.PdfArray; +import org.openpdf.text.pdf.PdfContentParser; +import org.openpdf.text.pdf.PdfDictionary; +import org.openpdf.text.pdf.PdfLiteral; +import org.openpdf.text.pdf.PdfName; +import org.openpdf.text.pdf.PdfObject; +import org.openpdf.text.pdf.PdfReader; +import org.openpdf.text.pdf.RandomAccessFileOrArray; + +/** + * Locates text pattern coordinates inside a PDF file. + * + * @since 2.1.4 + */ +@SuppressWarnings("WeakerAccess") +public class PdfTextLocator { + + /** + * The PdfReader that holds the PDF file. 
+ */ + private final PdfReader reader; + + /** + * The {@link TextAssembler} that will receive render notifications and provide resultant text + */ + private final TextAssembler renderListener; + + /** + * Creates a new Text Locator object, using a {@link TextAssembler} as the render listener + * + * @param reader the reader with the PDF + */ + public PdfTextLocator(PdfReader reader) { + this(reader, new MarkedUpTextAssembler(reader)); + } + + /** + * Creates a new Text Extractor object, using a {@link TextAssembler} as the render listener + * + * @param reader the reader with the PDF + * @param usePdfMarkupElements should we use higher level tags for PDF markup entities? + */ + public PdfTextLocator(PdfReader reader, String pattern, boolean usePdfMarkupElements) { + this(reader, new MarkedUpTextAssembler(reader, usePdfMarkupElements)); + } + + /** + * Creates a new Text Locator object. + * + * @param reader the reader with the PDF + * @param renderListener the render listener that will be used to analyze renderText operations and provide + * resultant text + */ + public PdfTextLocator(PdfReader reader, TextAssembler renderListener) { + this.reader = reader; + this.renderListener = renderListener; + } + + /** + * Gets the content bytes of a page. + * + * @param pageNum the 1-based page number of page you want get the content stream from + * @return a byte array with the effective content stream of a page + * @throws IOException + */ + private byte[] getContentBytesForPage(int pageNum) throws IOException { + try (RandomAccessFileOrArray ignored = reader.getSafeFile()) { + PdfDictionary pageDictionary = reader.getPageN(pageNum); + PdfObject contentObject = pageDictionary.get(PdfName.CONTENTS); + return getContentBytesFromContentObject(contentObject); + } + } + + /** + * Gets the content bytes from a content object, which may be a reference a stream or an array. 
+ * + * @param contentObject the object to read bytes from + * @return the content bytes + * @throws IOException + */ + private byte[] getContentBytesFromContentObject(PdfObject contentObject) throws IOException { + final byte[] result; + switch (contentObject.type()) { + case PdfObject.INDIRECT: + PRIndirectReference ref = (PRIndirectReference) contentObject; + PdfObject directObject = PdfReader.getPdfObject(ref); + result = getContentBytesFromContentObject(directObject); + break; + case PdfObject.STREAM: + PRStream stream = (PRStream) PdfReader.getPdfObject(contentObject); + result = PdfReader.getStreamBytes(stream); + break; + case PdfObject.ARRAY: + // Stitch together all content before calling processContent(), + // because + // processContent() resets state. + ByteArrayOutputStream allBytes = new ByteArrayOutputStream(); + PdfArray contentArray = (PdfArray) contentObject; + for (PdfObject pdfObject : contentArray.getElements()) { + allBytes.write(getContentBytesFromContentObject(pdfObject)); + } + result = allBytes.toByteArray(); + break; + default: + throw new IllegalStateException("Unable to handle Content of type " + contentObject.getClass()); + } + return result; + } + + /** + * Locates text pattern inside a page + * + * @param page page number we are interested in + * @param pattern text to match + * @return ArrayList List of matched text patterns with coordinates. 
+ * @throws IOException on error + */ + public ArrayList searchPage(int page, String pattern) throws IOException { + PdfDictionary pageDict = reader.getPageN(page); + if (pageDict == null) { + return new ArrayList<>(); + } + PdfDictionary resources = pageDict.getAsDict(PdfName.RESOURCES); + renderListener.reset(); + renderListener.setPage(page); + PdfContentTextLocator handler = new PdfContentTextLocator(renderListener, pattern, page); + processContent(getContentBytesForPage(page), resources, handler); + return handler.getMatchedPatterns(); + } + + /** + * Locates text pattern inside a PDF + * + * @param pattern text to match + * @return ArrayList List of matched text patterns with coordinates. + * @throws IOException on error + */ + public ArrayList searchFile(String pattern) throws IOException { + ArrayList res = new ArrayList<>(); + for (int page = 1; page <= reader.getNumberOfPages(); page++) { + res.addAll(searchPage(page, pattern)); + } + return res; + } + + /** + * Processes PDF syntax + * + * @param contentBytes the bytes of a content stream + * @param resources the resources that come with the content stream + * @param handler interprets events caused by recognition of operations in a content stream. + */ + public void processContent(byte[] contentBytes, PdfDictionary resources, + PdfContentTextLocator handler) { + handler.pushContext("div class='t-extracted-page'"); + try { + PdfContentParser ps = new PdfContentParser(new PRTokeniser(contentBytes)); + List operands = new ArrayList<>(); + while (!ps.parse(operands).isEmpty()) { + PdfLiteral operator = (PdfLiteral) operands.getLast(); + handler.invokeOperator(operator, operands, resources); + } + } catch (Exception e) { + throw new ExceptionConverter(e); + } + handler.popContext(); + } +}