diff --git a/gradle.properties b/gradle.properties
index 82d35ab..8d976cf 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
 group = org.xbib.graphics
 name = graphics
-version = 5.0.0
+version = 5.0.1
 
 org.gradle.warning.mode = ALL
diff --git a/graphics-pdfbox/src/main/java/module-info.java b/graphics-pdfbox/src/main/java/module-info.java
index 4354548..f557362 100644
--- a/graphics-pdfbox/src/main/java/module-info.java
+++ b/graphics-pdfbox/src/main/java/module-info.java
@@ -1,5 +1,6 @@
 module org.xbib.graphics.pdfbox {
     exports org.xbib.graphics.pdfbox;
+    exports org.xbib.graphics.pdfbox.analyze;
     exports org.xbib.graphics.pdfbox.color;
     exports org.xbib.graphics.pdfbox.draw;
     exports org.xbib.graphics.pdfbox.font;
diff --git a/graphics-pdfbox/src/main/java/org/xbib/graphics/pdfbox/analyze/DocumentAnalyzer.java b/graphics-pdfbox/src/main/java/org/xbib/graphics/pdfbox/analyze/DocumentAnalyzer.java
new file mode 100644
index 0000000..a6c2b57
--- /dev/null
+++ b/graphics-pdfbox/src/main/java/org/xbib/graphics/pdfbox/analyze/DocumentAnalyzer.java
@@ -0,0 +1,258 @@
+package org.xbib.graphics.pdfbox.analyze;
+
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+
+import java.awt.geom.Point2D;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Supplier;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Collects document metadata and per-page details (page boxes, rotation, images, fonts)
+ * of a PDF file into a nested map that is available through {@link #getResult()}.
+ */
+public class DocumentAnalyzer {
+
+    private static final Logger logger = Logger.getLogger(DocumentAnalyzer.class.getName());
+
+    private final Map<String, Object> result = new LinkedHashMap<>();
+
+    /** Image streams already reported; shared by all pages so each image is listed once. */
+    private final Set<COSStream> seen = new HashSet<>();
+
+    /**
+     * Loads the given PDF and analyzes it. Failures are logged at SEVERE level and not
+     * rethrown, so the result may be incomplete.
+     *
+     * @param file the PDF file to analyze
+     */
+    public DocumentAnalyzer(File file) {
+        try (PDDocument document = Loader.loadPDF(file)) {
+            documentToResult(document);
+            List<Map<String, Object>> pages = new ArrayList<>();
+            int imagecount = 0;
+            int pagecount = document.getNumberOfPages();
+            for (int i = 0; i < pagecount; i++) {
+                Map<String, Object> pageMap = analyze(i, document.getPage(i));
+                // analyze() of this class always stores a List<Map<String, Object>> under "images"
+                @SuppressWarnings("unchecked")
+                List<Map<String, Object>> images = (List<Map<String, Object>>) pageMap.get("images");
+                imagecount += images.size();
+                pages.add(pageMap);
+            }
+            result.put("pages", pages);
+            result.put("imagecount", imagecount);
+        } catch (Exception e) {
+            logger.log(Level.SEVERE, e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Copies the document information entries and the page count into the result.
+     *
+     * @param document the loaded document
+     */
+    private void documentToResult(PDDocument document) {
+        try {
+            PDDocumentInformation info = document.getDocumentInformation();
+            result.put("author", info.getAuthor());
+            result.put("creator", info.getCreator());
+            result.put("producer", info.getProducer());
+            result.put("title", info.getTitle());
+            result.put("pagecount", document.getNumberOfPages());
+            // each date is read on its own so that one broken entry does not hide the other
+            putDate("creationDate", info::getCreationDate);
+            putDate("modificationDate", info::getModificationDate);
+        } catch (Exception e) {
+            logger.log(Level.SEVERE, e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Stores a document date as an instant under the given key if the date is present.
+     * A failure while reading the date is logged and the key is left out.
+     *
+     * @param key the result key
+     * @param source supplies the date, may return null
+     */
+    private void putDate(String key, Supplier<Calendar> source) {
+        try {
+            Calendar calendar = source.get();
+            if (calendar != null) {
+                result.put(key, calendar.toInstant());
+            }
+        } catch (RuntimeException e) {
+            /*
+             * NPE if creation/modification dates are borked, seen with pdfbox 2.0.12:
+             * java.lang.NullPointerException: null
+             *   at java.text.SimpleDateFormat.matchZoneString(SimpleDateFormat.java:1695)
+             *   at java.text.SimpleDateFormat.parse(SimpleDateFormat.java:1541)
+             *   at org.apache.pdfbox.util.DateConverter.parseSimpleDate(DateConverter.java:587)
+             *   at org.apache.pdfbox.util.DateConverter.toCalendar(DateConverter.java:723)
+             *   at org.apache.pdfbox.cos.COSDictionary.getDate(COSDictionary.java:790)
+             *   at org.apache.pdfbox.pdmodel.PDDocumentInformation.getCreationDate(PDDocumentInformation.java:212)
+             */
+            logger.log(Level.SEVERE, e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Returns the analysis result. The returned map is the internal map, not a copy.
+     *
+     * @return the result map
+     */
+    public Map<String, Object> getResult() {
+        return result;
+    }
+
+    /**
+     * Analyzes a single page.
+     *
+     * @param i the page index, stored under the "page" key
+     * @param page the page to analyze
+     * @return a map with the page index, page boxes, rotation, images and fonts
+     * @throws IOException if processing the page content or its resources fails
+     */
+    public Map<String, Object> analyze(int i, PDPage page) throws IOException {
+        Map<String, Object> m = new LinkedHashMap<>();
+        m.put("page", i);
+        m.put("bbox", sizeOf(page.getBBox()));
+        m.put("cropbox", sizeOf(page.getCropBox()));
+        m.put("mediabox", sizeOf(page.getMediaBox()));
+        m.put("bleedbox", sizeOf(page.getBleedBox()));
+        m.put("rotation", page.getRotation());
+        List<Map<String, Object>> images = new ArrayList<>();
+        new ImageGraphicsExtractor(images, page).process();
+        m.put("images", images);
+        List<Map<String, Object>> fonts = new ArrayList<>();
+        PDResources res = page.getResources();
+        // getResources() may return null for a page without resources; report no fonts then
+        if (res != null) {
+            for (COSName cosName : res.getFontNames()) {
+                PDFont font = res.getFont(cosName);
+                if (font != null) {
+                    Map<String, Object> f = new LinkedHashMap<>();
+                    f.put("name", font.getName());
+                    f.put("damaged", font.isDamaged());
+                    f.put("embedded", font.isEmbedded());
+                    f.put("type", font.getType());
+                    f.put("subtype", font.getSubType());
+                    fonts.add(f);
+                }
+            }
+        }
+        m.put("fonts", fonts);
+        return m;
+    }
+
+    /** Returns the height and width of a page box as a two-entry map. */
+    private static Map<String, Float> sizeOf(PDRectangle box) {
+        return Map.of("height", box.getHeight(), "width", box.getWidth());
+    }
+
+    /**
+     * Graphics stream engine that records the image XObjects drawn on a page. All path,
+     * clipping and shading callbacks are left empty because only images are collected.
+     */
+    class ImageGraphicsExtractor extends PDFGraphicsStreamEngine {
+
+        private final List<Map<String, Object>> list;
+
+        protected ImageGraphicsExtractor(List<Map<String, Object>> list, PDPage page) {
+            super(page);
+            this.list = list;
+        }
+
+        /** Processes the page that was passed to the constructor. */
+        public void process() throws IOException {
+            processPage(getPage());
+        }
+
+        @Override
+        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
+        }
+
+        @Override
+        public void drawImage(PDImage pdImage) throws IOException {
+            if (pdImage instanceof PDImageXObject xobject) {
+                // add() returns false for an image stream that was already reported
+                if (!seen.add(xobject.getCOSObject())) {
+                    return;
+                }
+                Map<String, Object> m = new LinkedHashMap<>();
+                m.put("width", xobject.getWidth());
+                m.put("height", xobject.getHeight());
+                m.put("bitspercomponent", xobject.getBitsPerComponent());
+                m.put("colorspace", xobject.getColorSpace().getName());
+                m.put("suffix", xobject.getSuffix());
+                list.add(m);
+            }
+        }
+
+        @Override
+        public void clip(int windingRule) {
+        }
+
+        @Override
+        public void moveTo(float x, float y) throws IOException {
+        }
+
+        @Override
+        public void lineTo(float x, float y) throws IOException {
+        }
+
+        @Override
+        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
+        }
+
+        @Override
+        public Point2D getCurrentPoint() {
+            return null;
+        }
+
+        @Override
+        public void closePath() {
+        }
+
+        @Override
+        public void endPath() {
+        }
+
+        @Override
+        public void strokePath() {
+        }
+
+        @Override
+        public void fillPath(int windingRule) {
+        }
+
+        @Override
+        public void fillAndStrokePath(int windingRule) {
+        }
+
+        @Override
+        public void shadingFill(COSName shadingName) {
+        }
+    }
+}
+
diff --git a/graphics-pdfbox/src/test/java/org/xbib/graphics/pdfbox/test/DocumentAnalyzerTest.java b/graphics-pdfbox/src/test/java/org/xbib/graphics/pdfbox/test/DocumentAnalyzerTest.java
new file mode 100644
index 0000000..a412755
--- /dev/null
+++ b/graphics-pdfbox/src/test/java/org/xbib/graphics/pdfbox/test/DocumentAnalyzerTest.java
@@ -0,0 +1,33 @@
+package org.xbib.graphics.pdfbox.test;
+
+import org.junit.jupiter.api.Test;
+import org.xbib.graphics.pdfbox.analyze.DocumentAnalyzer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+public class DocumentAnalyzerTest {
+    private static final Logger logger = Logger.getLogger(DocumentAnalyzerTest.class.getName());
+
+    /** Analyzes the sample PDF resource, if present; temporary files are always removed. */
+    @Test
+    public void testDocument() throws IOException {
+        Path tmp = Files.createTempDirectory("document-analyzer");
+        Path path = tmp.resolve("antonio_sample.pdf");
+        try (InputStream inputStream = getClass().getResourceAsStream("antonio_sample.pdf")) {
+            if (inputStream != null) {
+                Files.copy(inputStream, path); // the copy is complete and closed before analysis
+                DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
+                logger.log(Level.INFO, () -> "result = " + documentAnalyzer.getResult());
+            }
+        } finally {
+            Files.deleteIfExists(path);
+            Files.deleteIfExists(tmp);
+        }
+    }
+}