java - 使用 JAVA 搜索和替换 PDF 中的文本

需要将pdf中的文本替换为不同的语言。第一步，我尝试使用 itextpdf ad pdfbox API 搜索并替换 pdf 文件中的文本。

使用以下代码片段，该代码片段使用 itextpdf api 从源 PDF 文件中搜索文本“Hello”并将其替换为“Hi”。新的 PDF 已创建，没有任何文本替换。

public void manipulatePdf(String src, String dest) throws Exception {
    PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC), new PdfWriter(DEST));
    int noOfPages = pdfDoc.getNumberOfPages();
    for (int i = 1; i < noOfPages; i++) {
        PdfPage page = pdfDoc.getPage(i);
        PdfDictionary dict = page.getPdfObject();
        PdfObject object = dict.get(PdfName.Contents);
        if (object instanceof PdfStream) {
            PdfStream stream = (PdfStream) object;
            byte[] data = stream.getBytes();
            stream.setData(new String(data).replace("Hello", "Hi").getBytes("UTF-8"));
        }
    }
    pdfDoc.close();
}

也使用 apache pdfbox 来实现同样的事情，但没有运气。以下是供引用的代码片段。

    public static PDDocument replaceText(PDDocument document, String searchString, String replacement)
        throws IOException {        
    for (PDPage page : document.getPages()) {
        PDFStreamParser parser = new PDFStreamParser(page);
        parser.parse();
        List tokens = parser.getTokens();
        for (int j = 0; j < tokens.size(); j++) {
            Object next = tokens.get(j);
            if (next instanceof Operator) {
                Operator op = (Operator) next;
                // Tj and TJ are the two operators that display strings in a PDF
                if (op.getName().equals("Tj")) {
                    // Tj takes one operator and that is the string to display
                    // so lets update that operator
                    COSString previous = (COSString) tokens.get(j - 1);
                    String string = previous.getString();
                    //System.out.println(new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
                    string = string.replaceFirst(searchString, replacement);
                    previous.setValue(string.getBytes());

                } else if (op.getName().equals("TJ")) {
                    COSArray previous = (COSArray) tokens.get(j - 1);
                    for (int k = 0; k < previous.size(); k++) {
                        Object arrElement = previous.getObject(k);
                        if (arrElement instanceof COSString) {
                            COSString cosString = (COSString) arrElement;
                            String string = cosString.getString();
                            //System.out.println("22::"+new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
                            string = StringUtils.replaceOnce(string, searchString, replacement);
                            cosString.setValue(string.getBytes());
                        }
                    }
                }
            }
        }

        PDStream updatedStream = new PDStream(document);
        OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
        ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
        tokenWriter.writeTokens(tokens);
         // save content
        page.setContents(updatedStream);
        out.close();
    }

任何解决方案/建议都将受到高度赞赏。

最佳答案

这是一个工作版本，使用 PDFBox

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;

public final class PDFEditor {

    private PDFEditor() {
    }

    public static void main(String[] args) throws IOException {
        PDDocument document = null;
        document = PDDocument.load(new File("src path"));
        document = replaceText(document, "Hello", "Hi");
        document.save("target Path");
        document.close();
    }

    private static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException {
        if (StringUtils.isEmpty(searchString) || StringUtils.isEmpty(replacement)) {
            return document;
        }

        for (PDPage page : document.getPages()) {
            PDFStreamParser parser = new PDFStreamParser(page);
            parser.parse();
            List<?> tokens = parser.getTokens();

            for (int j = 0; j < tokens.size(); j++) {
                Object next = tokens.get(j);
                if (next instanceof Operator) {
                    Operator op = (Operator) next;

                    String pstring = "";
                    int prej = 0;

                    if (op.getName().equals("Tj")) {
                        COSString previous = (COSString) tokens.get(j - 1);
                        String string = previous.getString();
                        string = string.replaceFirst(searchString, replacement);
                        previous.setValue(string.getBytes());
                    } else if (op.getName().equals("TJ")) {
                        COSArray previous = (COSArray) tokens.get(j - 1);
                        for (int k = 0; k < previous.size(); k++) {
                            Object arrElement = previous.getObject(k);
                            if (arrElement instanceof COSString) {
                                COSString cosString = (COSString) arrElement;
                                String string = cosString.getString();

                                if (j == prej) {
                                    pstring += string;
                                } else {
                                    prej = j;
                                    pstring = string;
                                }
                            }
                        }

                        if (searchString.equals(pstring.trim())) {
                            COSString cosString2 = (COSString) previous.getObject(0);
                            cosString2.setValue(replacement.getBytes());

                            int total = previous.size() - 1;
                            for (int k = total; k > 0; k--) {
                                previous.remove(k);
                            }
                        }
                    }
                }
            }
            PDStream updatedStream = new PDStream(document);
            OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
            ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
            tokenWriter.writeTokens(tokens);
            out.close();
            page.setContents(updatedStream);
        }

        return document;
    }

}

依赖关系:

<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.0.6</version>
</dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.11</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.0</version>
</dependency>

关于java - 使用 JAVA 搜索和替换 PDF 中的文本，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/52027733/

java - 使用 JAVA 搜索和替换 PDF 中的文本

上一篇：Java 程序在 "for"循环后卡住

下一篇：java - 使用 PHP(可能是使用 shell_exec() 函数的命令行)，如何确定 Java 应用程序/jar 文件当前是否正在运行？