From 67609394cc79380ef014d194e3d350610a416c81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Prante?=
Date: Mon, 4 Dec 2023 15:16:22 +0100
Subject: [PATCH] enhance document analyzer

Replace the DocumentAnalyzer(File) constructor with a no-argument
constructor and a process(File) method that clears the previous result
before analyzing a file.

For the bbox, cropbox, mediabox and bleedbox of every page, record the
recognized paper formats (A0-A6, Letter, Legal, Tabloid, portrait or
landscape) and derive the isa4/isletter/islandscape flags per page and
per document. A document is flagged as image when it has at least one
page and every page holds exactly one image.

Add isValid(), getPageCount(), isA4(), isLetter(), isLandscape(),
isImage() and getPage(int). Count ICCBased as a color space, drop the
INFO logging in strokePath()/fillPath(), and make analyzePage(),
PageExtractor and FontExtractor private.

Bump version to 5.0.3.
---
Review notes:
 - isA6Landscape() now compares the width against the A6 height and the
   height against the A6 width, like the other *Landscape() helpers. As
   posted it repeated the isA6() comparison, so the
   "else if (isA6Landscape(rect))" branch could never be taken.
 - The "isletter" page flag now guards on the isLetter list instead of
   the isLandscape list, matching the "isa4" line.
 - Generic type arguments were lost in the posted copy ("List> pages").
   Map<String, Object>, List<Map<String, Object>>, List<Boolean> and
   Set<String> are restored from how the values are used in this patch.
   The element type of the "seen" set is not visible here and is left
   as posted - TODO confirm against the source file.
 - Both fixes change added lines in place, so hunk sizes and the
   diffstat below are unchanged.

 gradle.properties                             |   2 +-
 .../pdfbox/analyze/DocumentAnalyzer.java      | 299 ++++++++++++++++--
 .../pdfbox/test/DocumentAnalyzerTest.java     |  10 +-
 3 files changed, 286 insertions(+), 25 deletions(-)

diff --git a/gradle.properties b/gradle.properties
index cc5c1ec..c3147a5 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
 group = org.xbib.graphics
 name = graphics
-version = 5.0.2
+version = 5.0.3
 
 org.gradle.warning.mode = ALL
diff --git a/graphics-pdfbox/src/main/java/org/xbib/graphics/pdfbox/analyze/DocumentAnalyzer.java b/graphics-pdfbox/src/main/java/org/xbib/graphics/pdfbox/analyze/DocumentAnalyzer.java
index 006ccdd..956818e 100644
--- a/graphics-pdfbox/src/main/java/org/xbib/graphics/pdfbox/analyze/DocumentAnalyzer.java
+++ b/graphics-pdfbox/src/main/java/org/xbib/graphics/pdfbox/analyze/DocumentAnalyzer.java
@@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
@@ -19,6 +20,7 @@ import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -29,22 +31,30 @@ public class DocumentAnalyzer {
 
     private static final Logger logger = Logger.getLogger(DocumentAnalyzer.class.getName());
 
-    private final Map<String, Object> result = new LinkedHashMap<>();
+    private final Map<String, Object> result;
 
-    private final Set seen = new HashSet<>();
+    public DocumentAnalyzer() {
+        result = new LinkedHashMap<>();
+    }
 
     @SuppressWarnings("unchecked")
-    public DocumentAnalyzer(File file) {
+    public void process(File file) {
+        result.clear();
+        Set seen = new HashSet<>();
         try (PDDocument document = Loader.loadPDF(file)) {
-            documentToResult(document);
+            documentInformationToResult(document.getDocumentInformation());
             List<Map<String, Object>> pages = new ArrayList<>();
-            int imagecount = 0;
+            int documentimagecount = 0;
             int pagecount = document.getNumberOfPages();
             boolean isDocumentColor = false;
             boolean isDocumentGray = false;
+            boolean isDocumentA4 = true;
+            boolean isDocumentLetter = true;
+            boolean isDocumentLandscape = false;
+            boolean isDocumentImage = true;
             for (int i = 0; i < pagecount; i++) {
                 PDPage pdPage = document.getPage(i);
-                Map<String, Object> pageMap = analyze(i, pdPage);
+                Map<String, Object> pageMap = analyzePage(i, pdPage, seen);
                 boolean isColor = (boolean) pageMap.get("iscolor");
                 if (isColor) {
                     isDocumentColor = true;
@@ -53,13 +63,28 @@ public class DocumentAnalyzer {
                 if (isGray) {
                     isDocumentGray = true;
                 }
+                boolean isA4 = (boolean) pageMap.get("isa4");
+                isDocumentA4 = isDocumentA4 && isA4;
+                boolean isLetter = (boolean) pageMap.get("isletter");
+                isDocumentLetter = isDocumentLetter && isLetter;
+                boolean isLandscape = (boolean) pageMap.get("islandscape");
+                if (isLandscape) {
+                    isDocumentLandscape = true;
+                }
                 List<Map<String, Object>> list = (List<Map<String, Object>>) pageMap.get("images");
-                imagecount += list.size();
+                int imagecount = list.size();
+                documentimagecount += imagecount;
+                isDocumentImage = isDocumentImage && (imagecount == 1);
                 pages.add(pageMap);
             }
-            result.put("imagecount", imagecount);
+            result.put("pagecount", pagecount);
+            result.put("imagecount", documentimagecount);
+            result.put("isimage", pagecount > 0 && isDocumentImage);
             result.put("iscolor", isDocumentColor);
             result.put("isgray", isDocumentGray);
+            result.put("isa4", pagecount > 0 && isDocumentA4);
+            result.put("isletter", pagecount > 0 && isDocumentLetter);
+            result.put("islandscape", isDocumentLandscape);
             result.put("pages", pages);
         } catch (Exception e) {
             logger.log(Level.WARNING, e.getMessage(), e);
@@ -70,23 +95,50 @@ public class DocumentAnalyzer {
         return result;
     }
 
+    public boolean isValid() {
+        return !result.isEmpty();
+    }
+
+    public int getPageCount() {
+        return (int) result.get("pagecount");
+    }
+
     public boolean isColor() {
         return (boolean) result.get("iscolor");
     }
 
-
     public boolean isGray() {
         return (boolean) result.get("isgray");
     }
 
-    private void documentToResult(PDDocument document) {
+    public boolean isA4() {
+        return (boolean) result.get("isa4");
+    }
+
+    public boolean isLetter() {
+        return (boolean) result.get("isletter");
+    }
+
+    public boolean isLandscape() {
+        return (boolean) result.get("islandscape");
+    }
+
+    public boolean isImage() {
+        return (boolean) result.get("isimage");
+    }
+
+    @SuppressWarnings("unchecked")
+    public Map<String, Object> getPage(int pageNumber) {
+        List<Map<String, Object>> pages = (List<Map<String, Object>>) result.get("pages");
+        return pages.get(pageNumber);
+    }
+
+    private void documentInformationToResult(PDDocumentInformation documentInformation) {
         try {
-            PDDocumentInformation documentInformation = document.getDocumentInformation();
             result.put("author", documentInformation.getAuthor());
             result.put("creator", documentInformation.getCreator());
             result.put("producer", documentInformation.getProducer());
             result.put("title", documentInformation.getTitle());
-            result.put("pagecount", document.getNumberOfPages());
             Calendar calendar = documentInformation.getCreationDate();
             if (calendar != null) {
                 result.put("creationDate", calendar.toInstant());
@@ -114,15 +166,16 @@ public class DocumentAnalyzer {
         }
     }
 
-    public Map<String, Object> analyze(int i, PDPage page) throws IOException {
+    private Map<String, Object> analyzePage(int i, PDPage page, Set seen) throws IOException {
         Map<String, Object> m = new LinkedHashMap<>();
         m.put("page", i);
         m.put("bbox", Map.of("height", page.getBBox().getHeight(), "width", page.getBBox().getWidth()));
         m.put("cropbox", Map.of("height", page.getCropBox().getHeight(), "width", page.getCropBox().getWidth()));
         m.put("mediabox", Map.of("height", page.getMediaBox().getHeight(), "width", page.getMediaBox().getWidth()));
         m.put("bleedbox", Map.of("height", page.getBleedBox().getHeight(), "width", page.getBleedBox().getWidth()));
+        checkBoxDimensions(page, m);
         m.put("rotation", page.getRotation());
-        PageExtractor pageExtractor = new PageExtractor(page);
+        PageExtractor pageExtractor = new PageExtractor(page, seen);
         pageExtractor.process();
         m.put("images", pageExtractor.getImages());
         m.put("iscolor", pageExtractor.isColor());
@@ -133,16 +186,219 @@ public class DocumentAnalyzer {
         return m;
     }
 
-    class PageExtractor extends PDFGraphicsStreamEngine {
+    private void checkBoxDimensions(PDPage page, Map<String, Object> m) {
+        List<Boolean> isA4 = new ArrayList<>();
+        List<Boolean> isLetter = new ArrayList<>();
+        List<Boolean> isLandscape= new ArrayList<>();
+        List.of(Map.entry("bbox", page.getBBox()),
+                Map.entry("cropbox", page.getCropBox()),
+                Map.entry("mediabox", page.getMediaBox()),
+                Map.entry("bleedbox", page.getBleedBox())).forEach(e -> {
+            String boxName = e.getKey();
+            PDRectangle rect = e.getValue();
+            Set<String> set = new LinkedHashSet<>();
+            if (isA0(rect)) {
+                set.add("A0");
+                isLandscape.add(false);
+            } else if (isA0Landscape(rect)) {
+                set.add("A0 Landscape");
+                isLandscape.add(true);
+            }
+            if (isA1(rect)) {
+                set.add("A1");
+                isLandscape.add(false);
+            } else if (isA1Landscape(rect)) {
+                set.add("A1 Landscape");
+                isLandscape.add(true);
+            }
+            if (isA2(rect)) {
+                set.add("A2");
+                isLandscape.add(false);
+            } else if (isA2Landscape(rect)) {
+                set.add("A2 Landscape");
+                isLandscape.add(true);
+            }
+            if (isA3(rect)) {
+                set.add("A3");
+                isLandscape.add(false);
+            } else if (isA3Landscape(rect)) {
+                set.add("A3 Landscape");
+                isLandscape.add(true);
+            }
+            if (isA4(rect)) {
+                set.add("A4");
+                isA4.add(true);
+                isLandscape.add(false);
+            } else if (isA4Landscape(rect)) {
+                set.add("A4 Landscape");
+                isA4.add(true);
+                isLandscape.add(true);
+            } else {
+                isA4.add(false);
+            }
+            if (isA5(rect)) {
+                set.add("A5");
+                isLandscape.add(false);
+            } else if (isA5Landscape(rect)) {
+                set.add("A5 Landscape");
+                isLandscape.add(true);
+            }
+            if (isA6(rect)) {
+                set.add("A6");
+                isLandscape.add(false);
+            } else if (isA6Landscape(rect)) {
+                set.add("A6 Landscape");
+                isLandscape.add(true);
+            }
+            if (isLetter(rect)) {
+                set.add("LETTER");
+                isLetter.add(true);
+                isLandscape.add(false);
+            } else if (isLetterLandscape(rect)) {
+                set.add("LETTER Landscape");
+                isLetter.add(true);
+                isLandscape.add(true);
+            } else {
+                isLetter.add(false);
+            }
+            if (isLegal(rect)) {
+                set.add("LEGAL");
+                isLandscape.add(false);
+            } else if (isLegalLandscape(rect)) {
+                set.add("LEGAL Landscape");
+                isLandscape.add(true);
+            }
+            if (isTabloid(rect)) {
+                set.add("TABLOID");
+                isLandscape.add(false);
+            } else if (isTabloidLandscape(rect)) {
+                set.add("TABLOID Landscape");
+                isLandscape.add(true);
+            }
+            m.put(boxName + "dimensions", set);
+        });
+        m.put("isa4", !isA4.isEmpty() && isA4.stream().allMatch(b -> b == Boolean.TRUE));
+        m.put("isletter", !isLetter.isEmpty() && isLetter.stream().allMatch(b -> b == Boolean.TRUE));
+        m.put("islandscape", !isLandscape.isEmpty() && isLandscape.stream().allMatch(b -> b == Boolean.TRUE));
+    }
+
+    private boolean isA0(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.A0.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.A0.getWidth());
+    }
+
+    private boolean isA0Landscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.A0.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.A0.getWidth());
+    }
+
+    private boolean isA1(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.A1.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.A1.getWidth());
+    }
+
+    private boolean isA1Landscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.A1.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.A1.getWidth());
+    }
+
+    private boolean isA2(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.A2.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.A2.getWidth());
+    }
+
+    private boolean isA2Landscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.A2.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.A2.getWidth());
+    }
+
+    private boolean isA3(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.A3.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.A3.getWidth());
+    }
+
+    private boolean isA3Landscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.A3.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.A3.getWidth());
+    }
+
+    private boolean isA4(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(),PDRectangle.A4.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.A4.getWidth());
+    }
+
+    private boolean isA4Landscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(),PDRectangle.A4.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.A4.getWidth());
+    }
+
+    private boolean isA5(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.A5.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.A5.getWidth());
+    }
+
+    private boolean isA5Landscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.A5.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.A5.getWidth());
+    }
+
+    private boolean isA6(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.A6.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.A6.getWidth());
+    }
+
+    private boolean isA6Landscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.A6.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.A6.getWidth());
+    }
+
+    private boolean isLetter(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.LETTER.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.LETTER.getWidth());
+    }
+
+    private boolean isLetterLandscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.LETTER.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.LETTER.getWidth());
+    }
+
+    private boolean isLegal(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.LEGAL.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.LEGAL.getWidth());
+    }
+
+    private boolean isLegalLandscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.LEGAL.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.LEGAL.getWidth());
+    }
+
+    private boolean isTabloid(PDRectangle rectangle) {
+        return compareFloat(rectangle.getHeight(), PDRectangle.TABLOID.getHeight()) &&
+                compareFloat(rectangle.getWidth(), PDRectangle.TABLOID.getWidth());
+    }
+
+    private boolean isTabloidLandscape(PDRectangle rectangle) {
+        return compareFloat(rectangle.getWidth(), PDRectangle.TABLOID.getHeight()) &&
+                compareFloat(rectangle.getHeight(), PDRectangle.TABLOID.getWidth());
+    }
+
+    private boolean compareFloat(float f1, float f2) {
+        return f1 == f2 || (Math.abs(f2 - f1) < 1.0f);
+    }
+
+    private static class PageExtractor extends PDFGraphicsStreamEngine {
 
         private final List<Map<String, Object>> images;
 
+        private final Set seen;
+
         private boolean isColor;
 
         private boolean isGray;
 
-        protected PageExtractor(PDPage page) {
+        private PageExtractor(PDPage page, Set seen) {
             super(page);
+            this.seen = seen;
             this.images = new ArrayList<>();
             this.isColor = false;
             this.isGray = false;
@@ -204,11 +460,11 @@ public class DocumentAnalyzer {
         }
 
         @Override
-        public void moveTo(float x, float y) throws IOException {
+        public void moveTo(float x, float y) {
         }
 
         @Override
-        public void lineTo(float x, float y) throws IOException {
+        public void lineTo(float x, float y) {
         }
 
         @Override
@@ -232,7 +488,6 @@ public class DocumentAnalyzer {
         public void strokePath() {
             String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
             if (isColorSpaceName(colorSpaceName)) {
-                logger.log(Level.INFO, "strokepath: color true, " + colorSpaceName);
                 this.isColor = true;
             }
             if (isGraySpaceName(colorSpaceName)) {
@@ -244,7 +499,6 @@ public class DocumentAnalyzer {
         public void fillPath(int windingRule) {
             String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
             if (isColorSpaceName(colorSpaceName)) {
-                logger.log(Level.INFO, "fillpath: color true " + colorSpaceName);
                 this.isColor = true;
             }
             if (isGraySpaceName(colorSpaceName)) {
@@ -263,6 +517,7 @@ public class DocumentAnalyzer {
         private boolean isColorSpaceName(String name) {
             return "DeviceRGB".equals(name) ||
                     "DeviceCMYK".equals(name) ||
+                    "ICCBased".equals(name) ||
                     "Indexed".equals(name);
         }
 
@@ -271,13 +526,13 @@ public class DocumentAnalyzer {
         }
     }
 
-    static class FontExtractor {
+    private static class FontExtractor {
 
         private final List<Map<String, Object>> fonts;
 
         private final PDResources res;
 
-        public FontExtractor(PDPage page) {
+        private FontExtractor(PDPage page) {
             fonts = new ArrayList<>();
             res = page.getResources();
         }
diff --git a/graphics-pdfbox/src/test/java/org/xbib/graphics/pdfbox/test/DocumentAnalyzerTest.java b/graphics-pdfbox/src/test/java/org/xbib/graphics/pdfbox/test/DocumentAnalyzerTest.java
index aeec787..05d64f3 100644
--- a/graphics-pdfbox/src/test/java/org/xbib/graphics/pdfbox/test/DocumentAnalyzerTest.java
+++ b/graphics-pdfbox/src/test/java/org/xbib/graphics/pdfbox/test/DocumentAnalyzerTest.java
@@ -18,16 +18,22 @@ public class DocumentAnalyzerTest {
     @Test
     public void testDocument() throws IOException {
         Path tmp = Files.createTempDirectory("document-analyzer");
-        String sample = "antonio_sample.pdf";
+        String sample = "20200000063.pdf";
         Path path = tmp.resolve(sample);
         try (InputStream inputStream = getClass().getResourceAsStream(sample);
              OutputStream outputStream = Files.newOutputStream(path)) {
             if (inputStream != null) {
                 inputStream.transferTo(outputStream);
-                DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
+                DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer();
+                documentAnalyzer.process(path.toFile());
                 logger.log(Level.INFO, "result = " + documentAnalyzer.getResult());
+                logger.log(Level.INFO, "isvalid = " + documentAnalyzer.isValid());
                 logger.log(Level.INFO, "iscolor = " + documentAnalyzer.isColor());
                 logger.log(Level.INFO, "isgray = " + documentAnalyzer.isGray());
+                logger.log(Level.INFO, "isA4 = " + documentAnalyzer.isA4());
+                logger.log(Level.INFO, "isLetter = " + documentAnalyzer.isLetter());
+                logger.log(Level.INFO, "islandscape = " + documentAnalyzer.isLandscape());
+                logger.log(Level.INFO, "isimage = " + documentAnalyzer.isImage());
             }
         }
         Files.delete(path);