add java version of document analyzer

This commit is contained in:
Jörg Prante 2023-10-24 15:01:25 +02:00
parent ff5512b813
commit 2fdccf2c79
4 changed files with 243 additions and 1 deletions

View file

@ -1,5 +1,5 @@
group = org.xbib.graphics
name = graphics
version = 5.0.0
version = 5.0.1
org.gradle.warning.mode = ALL

View file

@ -1,5 +1,6 @@
module org.xbib.graphics.pdfbox {
exports org.xbib.graphics.pdfbox;
exports org.xbib.graphics.pdfbox.analyze;
exports org.xbib.graphics.pdfbox.color;
exports org.xbib.graphics.pdfbox.draw;
exports org.xbib.graphics.pdfbox.font;

View file

@ -0,0 +1,208 @@
package org.xbib.graphics.pdfbox.analyze;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import java.awt.geom.Point2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Loads a PDF file and collects descriptive information about it into a map.
 *
 * <p>The map returned by {@link #getResult()} is filled by the constructor and may contain
 * the keys {@code author}, {@code creator}, {@code producer}, {@code title},
 * {@code pagecount}, {@code creationDate}, {@code modificationDate}, {@code pages}
 * and {@code imagecount}. Failures while loading or analyzing are logged and not
 * rethrown, so on error the map is only partially filled ({@code pages} and
 * {@code imagecount} are written last).</p>
 */
public class DocumentAnalyzer {

    private static final Logger logger = Logger.getLogger(DocumentAnalyzer.class.getName());

    /** Collected document information, in insertion order. */
    private final Map<String, Object> result = new LinkedHashMap<>();

    /** Image streams already reported, shared by all pages so each image is listed once. */
    private final Set<COSStream> seen = new HashSet<>();

    /**
     * Loads the given PDF file and analyzes the document and every page of it.
     * Any exception is logged at {@code SEVERE} level and swallowed.
     *
     * @param file the PDF file to analyze
     */
    public DocumentAnalyzer(File file) {
        try (PDDocument document = Loader.loadPDF(file)) {
            documentToResult(document);
            List<Map<String, Object>> pages = new ArrayList<>();
            int imagecount = 0;
            int pagecount = document.getNumberOfPages();
            for (int i = 0; i < pagecount; i++) {
                Map<String, Object> pageMap = analyze(i, document.getPage(i));
                // only the size is needed, so a wildcard pattern avoids an unchecked cast
                if (pageMap.get("images") instanceof List<?> images) {
                    imagecount += images.size();
                }
                pages.add(pageMap);
            }
            result.put("pages", pages);
            result.put("imagecount", imagecount);
        } catch (Exception e) {
            logger.log(Level.SEVERE, e.getMessage(), e);
        }
    }

    /**
     * Copies the document information entries and the page count into the result map.
     * Each date is read in its own try block so that one unparseable date does not
     * prevent the other one from being recorded.
     *
     * @param document the loaded document
     */
    private void documentToResult(PDDocument document) {
        try {
            var information = document.getDocumentInformation();
            result.put("author", information.getAuthor());
            result.put("creator", information.getCreator());
            result.put("producer", information.getProducer());
            result.put("title", information.getTitle());
            result.put("pagecount", document.getNumberOfPages());
            // Broken date strings have been observed to raise a NullPointerException
            // (SimpleDateFormat.matchZoneString, reached via
            // PDDocumentInformation.getCreationDate -> DateConverter.toCalendar, pdfbox 2.0.12).
            try {
                Calendar created = information.getCreationDate();
                if (created != null) {
                    result.put("creationDate", created.toInstant());
                }
            } catch (Exception e) {
                logger.log(Level.WARNING, "unable to read creation date", e);
            }
            try {
                Calendar modified = information.getModificationDate();
                if (modified != null) {
                    result.put("modificationDate", modified.toInstant());
                }
            } catch (Exception e) {
                logger.log(Level.WARNING, "unable to read modification date", e);
            }
        } catch (Exception e) {
            logger.log(Level.SEVERE, e.getMessage(), e);
        }
    }

    /**
     * Returns the map filled by the constructor.
     *
     * @return the internal result map (not a copy)
     */
    public Map<String, Object> getResult() {
        return result;
    }

    /**
     * Collects box sizes, rotation, images and fonts of a single page.
     *
     * @param i the page index, stored under the key {@code page}
     * @param page the page to analyze
     * @return a map with the keys {@code page}, {@code bbox}, {@code cropbox},
     *         {@code mediabox}, {@code bleedbox}, {@code rotation}, {@code images}
     *         and {@code fonts}
     * @throws IOException if processing the page content or loading a font fails
     */
    public Map<String, Object> analyze(int i, PDPage page) throws IOException {
        Map<String, Object> m = new LinkedHashMap<>();
        m.put("page", i);
        m.put("bbox", Map.of("height", page.getBBox().getHeight(), "width", page.getBBox().getWidth()));
        m.put("cropbox", Map.of("height", page.getCropBox().getHeight(), "width", page.getCropBox().getWidth()));
        m.put("mediabox", Map.of("height", page.getMediaBox().getHeight(), "width", page.getMediaBox().getWidth()));
        m.put("bleedbox", Map.of("height", page.getBleedBox().getHeight(), "width", page.getBleedBox().getWidth()));
        m.put("rotation", page.getRotation());
        List<Map<String, Object>> list = new ArrayList<>();
        ImageGraphicsExtractor extractor = new ImageGraphicsExtractor(list, page);
        extractor.process();
        m.put("images", list);
        List<Map<String, Object>> fonts = new ArrayList<>();
        PDResources res = page.getResources();
        // a page without a resource dictionary yields null here; report no fonts then
        if (res != null) {
            for (COSName cosName : res.getFontNames()) {
                PDFont font = res.getFont(cosName);
                if (font != null) {
                    Map<String, Object> f = new LinkedHashMap<>();
                    f.put("name", font.getName());
                    f.put("damaged", font.isDamaged());
                    f.put("embedded", font.isEmbedded());
                    f.put("type", font.getType());
                    f.put("subtype", font.getSubType());
                    fonts.add(f);
                }
            }
        }
        m.put("fonts", fonts);
        return m;
    }

    /**
     * Graphics stream engine that ignores all path operations and records one map
     * per image XObject drawn on the page. Images whose stream is already contained
     * in the enclosing analyzer's {@code seen} set are skipped.
     */
    class ImageGraphicsExtractor extends PDFGraphicsStreamEngine {

        private final List<Map<String, Object>> list;

        /**
         * @param list receives one map per newly seen image
         * @param page the page whose content stream is processed
         */
        protected ImageGraphicsExtractor(List<Map<String, Object>> list, PDPage page) {
            super(page);
            this.list = list;
        }

        /**
         * Processes the content stream of the page given at construction.
         *
         * @throws IOException if the content stream cannot be processed
         */
        public void process() throws IOException {
            processPage(getPage());
        }

        @Override
        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
        }

        /**
         * Records width, height, bits per component, color space name and suffix
         * of an image XObject; other image kinds are ignored.
         */
        @Override
        public void drawImage(PDImage pdImage) throws IOException {
            if (pdImage instanceof PDImageXObject xobject) {
                // add() returns false when the stream is already present: duplicate image
                if (!seen.add(xobject.getCOSObject())) {
                    return;
                }
                Map<String, Object> m = new LinkedHashMap<>();
                m.put("width", xobject.getWidth());
                m.put("height", xobject.getHeight());
                m.put("bitspercomponent", xobject.getBitsPerComponent());
                m.put("colorspace", xobject.getColorSpace().getName());
                m.put("suffix", xobject.getSuffix());
                list.add(m);
            }
        }

        @Override
        public void clip(int windingRule) {
        }

        @Override
        public void moveTo(float x, float y) throws IOException {
        }

        @Override
        public void lineTo(float x, float y) throws IOException {
        }

        @Override
        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
        }

        /**
         * Returns a dummy point because no path is tracked here.
         * NOTE(review): PDFBox path operators treat a null current point as a
         * missing initial MoveTo and log a warning per operation; a non-null
         * placeholder is what the PDFBox custom engine example returns.
         */
        @Override
        public Point2D getCurrentPoint() {
            return new Point2D.Float(0, 0);
        }

        @Override
        public void closePath() {
        }

        @Override
        public void endPath() {
        }

        @Override
        public void strokePath() {
        }

        @Override
        public void fillPath(int windingRule) {
        }

        @Override
        public void fillAndStrokePath(int windingRule) {
        }

        @Override
        public void shadingFill(COSName shadingName) {
        }
    }
}

View file

@ -0,0 +1,33 @@
package org.xbib.graphics.pdfbox.test;
import org.junit.jupiter.api.Test;
import org.xbib.graphics.pdfbox.analyze.DocumentAnalyzer;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Runs {@link DocumentAnalyzer} against the bundled sample PDF.
 */
public class DocumentAnalyzerTest {

    private static final Logger logger = Logger.getLogger(DocumentAnalyzerTest.class.getName());

    /**
     * Copies the sample PDF from the class path into a temporary directory,
     * analyzes it and logs the result. The temporary file and directory are
     * removed even when the test fails.
     *
     * @throws IOException if the sample is missing or cannot be copied
     */
    @Test
    public void testDocument() throws IOException {
        Path tmp = Files.createTempDirectory("document-analyzer");
        Path path = tmp.resolve("antonio_sample.pdf");
        try {
            try (InputStream inputStream = getClass().getResourceAsStream("antonio_sample.pdf")) {
                // a missing fixture must fail the test instead of passing silently
                if (inputStream == null) {
                    throw new IOException("test resource antonio_sample.pdf not found");
                }
                // close the copy before the analyzer opens the file
                try (OutputStream outputStream = Files.newOutputStream(path)) {
                    inputStream.transferTo(outputStream);
                }
            }
            DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
            logger.log(Level.INFO, "result = " + documentAnalyzer.getResult());
            // the analyzer logs and swallows errors; "pages" is only written on success
            if (!documentAnalyzer.getResult().containsKey("pages")) {
                throw new AssertionError("analysis failed, result contains no pages");
            }
        } finally {
            Files.deleteIfExists(path);
            Files.deleteIfExists(tmp);
        }
    }
}