Hi,

I am trying to use PDFBox to find and replace text in a PDF using the
following code.

However, it does not work with my PDF. I am attaching input.pdf and the
Java code. Can anyone please let me know what is wrong here?

Thank you
Muthu



import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;

public class PdfBoxTester {

private static final String TEST_PDF = "/shn/input.pdf";

private static final Pattern TOKEN_PATTERN =
Pattern.compile("[0-9a-f]{64}+|[0-9A-F]{40}+");

public static void main(String[] args) throws Exception {
substituteTokens();

}

private static void substituteTokens() throws IOException {
PDDocument document = null;
try (InputStream inputStream = new FileInputStream(new File(TEST_PDF))) {
try {
document = PDDocument.load(inputStream);
if (document.isEncrypted()) {
throw new IOException("Error: Encrypted documents are not supported for
this example.");
}
for (PDPage page : document.getPages()) {
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List<Object> tokens = parser.getTokens();
List<Object> newTokens = new ArrayList<Object>();
for (Object token : tokens) {
if (token instanceof Operator) {
Operator op = (Operator) token;
if (op.getName().equals("TJ") || op.getName().equals("Tj")) {

Object argumentToken = newTokens.get(newTokens.size() - 1);
if (argumentToken instanceof COSString) {
COSString stringToken = (COSString) argumentToken;
Collection<String> tokenStrings = collectTokens(stringToken.getString());
if (!tokenStrings.isEmpty()) {
String detokenizedString = substituteTokens(stringToken.getString(),
tokenStrings, "static replacement");
if (detokenizedString != null) {
stringToken.setValue(detokenizedString.getBytes());
}
}
}
}
}
newTokens.add(token);
}
PDStream newContents = new PDStream(document);
OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter writer = new ContentStreamWriter(out);
writer.writeTokens(newTokens);
out.close();
page.setContents(newContents);
}
FileOutputStream fileOutputStream = new FileOutputStream(new
File("/shn/output.pdf"));
document.save(fileOutputStream);
} finally {
if (document != null) {
document.close();
}
inputStream.close();
}
}
}

public static Collection<String> collectTokens(String tokenizedText) throws
IOException {
Set<String> tokens = new HashSet<>();
Matcher matcher = TOKEN_PATTERN.matcher(tokenizedText);
while (matcher.find()) {
tokens.add(matcher.group());
}
return tokens;
}

public static String substituteTokens(String text, Collection<String>
tokens, String staticReplacementText) {
String result = text;
for (String token : tokens) {
result = result.replace(token, staticReplacementText);
}
return result;
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: users-unsubscr...@pdfbox.apache.org
For additional commands, e-mail: users-h...@pdfbox.apache.org

Reply via email to