enhance document analyzer

This commit is contained in:
Jörg Prante 2023-12-04 15:16:22 +01:00
parent b08fa2d137
commit 67609394cc
3 changed files with 286 additions and 25 deletions

View file

@ -1,5 +1,5 @@
group = org.xbib.graphics
name = graphics
version = 5.0.2
version = 5.0.3
org.gradle.warning.mode = ALL

View file

@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
@ -19,6 +20,7 @@ import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -29,22 +31,30 @@ public class DocumentAnalyzer {
private static final Logger logger = Logger.getLogger(DocumentAnalyzer.class.getName());
private final Map<String, Object> result = new LinkedHashMap<>();
private final Map<String, Object> result;
private final Set<COSStream> seen = new HashSet<>();
/**
 * Creates an analyzer whose result map starts out empty.
 */
public DocumentAnalyzer() {
    this.result = new LinkedHashMap<>();
}
@SuppressWarnings("unchecked")
public DocumentAnalyzer(File file) {
public void process(File file) {
result.clear();
Set<COSStream> seen = new HashSet<>();
try (PDDocument document = Loader.loadPDF(file)) {
documentToResult(document);
documentInformationToResult(document.getDocumentInformation());
List<Map<String, Object>> pages = new ArrayList<>();
int imagecount = 0;
int documentimagecount = 0;
int pagecount = document.getNumberOfPages();
boolean isDocumentColor = false;
boolean isDocumentGray = false;
boolean isDocumentA4 = true;
boolean isDocumentLetter = true;
boolean isDocumentLandscape = false;
boolean isDocumentImage = true;
for (int i = 0; i < pagecount; i++) {
PDPage pdPage = document.getPage(i);
Map<String, Object> pageMap = analyze(i, pdPage);
Map<String, Object> pageMap = analyzePage(i, pdPage, seen);
boolean isColor = (boolean) pageMap.get("iscolor");
if (isColor) {
isDocumentColor = true;
@ -53,13 +63,28 @@ public class DocumentAnalyzer {
if (isGray) {
isDocumentGray = true;
}
boolean isA4 = (boolean) pageMap.get("isa4");
isDocumentA4 = isDocumentA4 && isA4;
boolean isLetter = (boolean) pageMap.get("isletter");
isDocumentLetter = isDocumentLetter && isLetter;
boolean isLandscape = (boolean) pageMap.get("islandscape");
if (isLandscape) {
isDocumentLandscape = true;
}
List<Map<String, Object>> list = (List<Map<String, Object>>) pageMap.get("images");
imagecount += list.size();
int imagecount = list.size();
documentimagecount += imagecount;
isDocumentImage = isDocumentImage && (imagecount == 1);
pages.add(pageMap);
}
result.put("imagecount", imagecount);
result.put("pagecount", pagecount);
result.put("imagecount", documentimagecount);
result.put("isimage", pagecount > 0 && isDocumentImage);
result.put("iscolor", isDocumentColor);
result.put("isgray", isDocumentGray);
result.put("isa4", pagecount > 0 && isDocumentA4);
result.put("isletter", pagecount > 0 && isDocumentLetter);
result.put("islandscape", isDocumentLandscape);
result.put("pages", pages);
} catch (Exception e) {
logger.log(Level.WARNING, e.getMessage(), e);
@ -70,23 +95,50 @@ public class DocumentAnalyzer {
return result;
}
/**
 * Tells whether the result map currently holds any entry.
 *
 * @return true if the result map is not empty
 */
public boolean isValid() {
    boolean nothingRecorded = result.isEmpty();
    return !nothingRecorded;
}
/**
 * Returns the page count recorded in the result map.
 *
 * @return the integer stored under "pagecount", or 0 when no such entry exists
 *         (for example when nothing has been processed yet), instead of failing
 *         with a NullPointerException while unboxing
 */
public int getPageCount() {
    Object count = result.get("pagecount");
    return count instanceof Integer ? (Integer) count : 0;
}
/**
 * Returns the color flag recorded in the result map.
 *
 * @return the value stored under "iscolor", or false when no such entry exists
 *         (for example when nothing has been processed yet), instead of failing
 *         with a NullPointerException while unboxing
 */
public boolean isColor() {
    return Boolean.TRUE.equals(result.get("iscolor"));
}
/**
 * Returns the gray flag recorded in the result map.
 *
 * @return the value stored under "isgray", or false when no such entry exists
 *         (for example when nothing has been processed yet), instead of failing
 *         with a NullPointerException while unboxing
 */
public boolean isGray() {
    return Boolean.TRUE.equals(result.get("isgray"));
}
private void documentToResult(PDDocument document) {
/**
 * Returns the document-level A4 flag recorded in the result map.
 *
 * @return the value stored under "isa4", or false when no such entry exists
 *         (for example when nothing has been processed yet), instead of failing
 *         with a NullPointerException while unboxing
 */
public boolean isA4() {
    return Boolean.TRUE.equals(result.get("isa4"));
}
/**
 * Returns the document-level Letter flag recorded in the result map.
 *
 * @return the value stored under "isletter", or false when no such entry exists
 *         (for example when nothing has been processed yet), instead of failing
 *         with a NullPointerException while unboxing
 */
public boolean isLetter() {
    return Boolean.TRUE.equals(result.get("isletter"));
}
/**
 * Returns the document-level landscape flag recorded in the result map.
 *
 * @return the value stored under "islandscape", or false when no such entry exists
 *         (for example when nothing has been processed yet), instead of failing
 *         with a NullPointerException while unboxing
 */
public boolean isLandscape() {
    return Boolean.TRUE.equals(result.get("islandscape"));
}
/**
 * Returns the document-level image flag recorded in the result map.
 *
 * @return the value stored under "isimage", or false when no such entry exists
 *         (for example when nothing has been processed yet), instead of failing
 *         with a NullPointerException while unboxing
 */
public boolean isImage() {
    return Boolean.TRUE.equals(result.get("isimage"));
}
/**
 * Returns the analysis map of a single page.
 *
 * @param pageNumber index into the list stored under "pages" in the result map
 * @return the page map found at that index
 */
@SuppressWarnings("unchecked")
public Map<String, Object> getPage(int pageNumber) {
    Object stored = result.get("pages");
    List<Map<String, Object>> pageList = (List<Map<String, Object>>) stored;
    return pageList.get(pageNumber);
}
private void documentInformationToResult(PDDocumentInformation documentInformation) {
try {
PDDocumentInformation documentInformation = document.getDocumentInformation();
result.put("author", documentInformation.getAuthor());
result.put("creator", documentInformation.getCreator());
result.put("producer", documentInformation.getProducer());
result.put("title", documentInformation.getTitle());
result.put("pagecount", document.getNumberOfPages());
Calendar calendar = documentInformation.getCreationDate();
if (calendar != null) {
result.put("creationDate", calendar.toInstant());
@ -114,15 +166,16 @@ public class DocumentAnalyzer {
}
}
public Map<String, Object> analyze(int i, PDPage page) throws IOException {
private Map<String, Object> analyzePage(int i, PDPage page, Set<COSStream> seen) throws IOException {
Map<String, Object> m = new LinkedHashMap<>();
m.put("page", i);
m.put("bbox", Map.of("height", page.getBBox().getHeight(), "width", page.getBBox().getWidth()));
m.put("cropbox", Map.of("height", page.getCropBox().getHeight(), "width", page.getCropBox().getWidth()));
m.put("mediabox", Map.of("height", page.getMediaBox().getHeight(), "width", page.getMediaBox().getWidth()));
m.put("bleedbox", Map.of("height", page.getBleedBox().getHeight(), "width", page.getBleedBox().getWidth()));
checkBoxDimensions(page, m);
m.put("rotation", page.getRotation());
PageExtractor pageExtractor = new PageExtractor(page);
PageExtractor pageExtractor = new PageExtractor(page, seen);
pageExtractor.process();
m.put("images", pageExtractor.getImages());
m.put("iscolor", pageExtractor.isColor());
@ -133,16 +186,219 @@ public class DocumentAnalyzer {
return m;
}
class PageExtractor extends PDFGraphicsStreamEngine {
/**
 * Classifies the page's boxes against known paper formats and records the findings in the page map.
 *
 * For each of the bounding box, crop box, media box and bleed box, the set of matching format
 * names (for example "A4" or "A4 Landscape") is stored under {@code boxName + "dimensions"}.
 * Three page-level flags are stored as well:
 * <ul>
 * <li>"isa4" - every box is A4, in either orientation</li>
 * <li>"isletter" - every box is Letter, in either orientation</li>
 * <li>"islandscape" - at least one box matched a known format and every match was a landscape one</li>
 * </ul>
 *
 * @param page the page whose boxes are inspected
 * @param m the page map that receives the entries
 */
private void checkBoxDimensions(PDPage page, Map<String, Object> m) {
    List<Boolean> a4Flags = new ArrayList<>();
    List<Boolean> letterFlags = new ArrayList<>();
    List<Boolean> landscapeFlags = new ArrayList<>();
    List<Map.Entry<String, PDRectangle>> boxes = List.of(
            Map.entry("bbox", page.getBBox()),
            Map.entry("cropbox", page.getCropBox()),
            Map.entry("mediabox", page.getMediaBox()),
            Map.entry("bleedbox", page.getBleedBox()));
    for (Map.Entry<String, PDRectangle> box : boxes) {
        PDRectangle rect = box.getValue();
        // LinkedHashSet keeps the format names in the order they are checked below.
        Set<String> formats = new LinkedHashSet<>();
        recordFormat("A0", isA0(rect), isA0Landscape(rect), formats, landscapeFlags);
        recordFormat("A1", isA1(rect), isA1Landscape(rect), formats, landscapeFlags);
        recordFormat("A2", isA2(rect), isA2Landscape(rect), formats, landscapeFlags);
        recordFormat("A3", isA3(rect), isA3Landscape(rect), formats, landscapeFlags);
        // A4 and Letter contribute exactly one flag per box, whether they match or not.
        a4Flags.add(recordFormat("A4", isA4(rect), isA4Landscape(rect), formats, landscapeFlags));
        recordFormat("A5", isA5(rect), isA5Landscape(rect), formats, landscapeFlags);
        recordFormat("A6", isA6(rect), isA6Landscape(rect), formats, landscapeFlags);
        letterFlags.add(recordFormat("LETTER", isLetter(rect), isLetterLandscape(rect), formats, landscapeFlags));
        recordFormat("LEGAL", isLegal(rect), isLegalLandscape(rect), formats, landscapeFlags);
        recordFormat("TABLOID", isTabloid(rect), isTabloidLandscape(rect), formats, landscapeFlags);
        m.put(box.getKey() + "dimensions", formats);
    }
    m.put("isa4", allTrue(a4Flags));
    // Guard on the letter list itself rather than on the landscape list.
    m.put("isletter", allTrue(letterFlags));
    m.put("islandscape", allTrue(landscapeFlags));
}

/**
 * Records one format match for a box. A portrait match takes precedence over a landscape match.
 *
 * @param name the portrait format name; " Landscape" is appended for a landscape match
 * @param portrait whether the box matched the format in portrait orientation
 * @param landscape whether the box matched the format in landscape orientation
 * @param formats the set of format names collected for the current box
 * @param landscapeFlags receives false for a portrait match and true for a landscape match
 * @return true if the box matched the format in either orientation
 */
private boolean recordFormat(String name, boolean portrait, boolean landscape,
                             Set<String> formats, List<Boolean> landscapeFlags) {
    if (portrait) {
        formats.add(name);
        landscapeFlags.add(false);
        return true;
    }
    if (landscape) {
        formats.add(name + " Landscape");
        landscapeFlags.add(true);
        return true;
    }
    return false;
}

/**
 * Returns true when the list is non-empty and contains no false entry.
 */
private boolean allTrue(List<Boolean> flags) {
    return !flags.isEmpty() && !flags.contains(Boolean.FALSE);
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.A0.
 * Height and width are each checked through compareFloat.
 */
private boolean isA0(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A0;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.A0 rotated to landscape:
 * its width is checked against the A0 height and its height against the A0 width.
 */
private boolean isA0Landscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A0;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.A1.
 * Height and width are each checked through compareFloat.
 */
private boolean isA1(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A1;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.A1 rotated to landscape:
 * its width is checked against the A1 height and its height against the A1 width.
 */
private boolean isA1Landscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A1;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.A2.
 * Height and width are each checked through compareFloat.
 */
private boolean isA2(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A2;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.A2 rotated to landscape:
 * its width is checked against the A2 height and its height against the A2 width.
 */
private boolean isA2Landscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A2;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.A3.
 * Height and width are each checked through compareFloat.
 */
private boolean isA3(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A3;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.A3 rotated to landscape:
 * its width is checked against the A3 height and its height against the A3 width.
 */
private boolean isA3Landscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A3;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.A4.
 * Height and width are each checked through compareFloat.
 */
private boolean isA4(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A4;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.A4 rotated to landscape:
 * its width is checked against the A4 height and its height against the A4 width.
 */
private boolean isA4Landscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A4;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.A5.
 * Height and width are each checked through compareFloat.
 */
private boolean isA5(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A5;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.A5 rotated to landscape:
 * its width is checked against the A5 height and its height against the A5 width.
 */
private boolean isA5Landscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A5;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.A6.
 * Height and width are each checked through compareFloat.
 */
private boolean isA6(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.A6;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.A6 rotated to landscape:
 * its width is checked against the A6 height and its height against the A6 width.
 *
 * @param rectangle the box to test
 * @return true if both swapped comparisons pass compareFloat
 */
private boolean isA6Landscape(PDRectangle rectangle) {
    // Width and height are swapped relative to isA6, as in the other *Landscape checks;
    // comparing height to height here would only ever recognise portrait A6.
    return compareFloat(rectangle.getWidth(), PDRectangle.A6.getHeight()) &&
            compareFloat(rectangle.getHeight(), PDRectangle.A6.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.LETTER.
 * Height and width are each checked through compareFloat.
 */
private boolean isLetter(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.LETTER;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.LETTER rotated to landscape:
 * its width is checked against the Letter height and its height against the Letter width.
 */
private boolean isLetterLandscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.LETTER;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.LEGAL.
 * Height and width are each checked through compareFloat.
 */
private boolean isLegal(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.LEGAL;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.LEGAL rotated to landscape:
 * its width is checked against the Legal height and its height against the Legal width.
 */
private boolean isLegalLandscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.LEGAL;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the portrait dimensions of PDRectangle.TABLOID.
 * Height and width are each checked through compareFloat.
 */
private boolean isTabloid(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.TABLOID;
    if (!compareFloat(rectangle.getHeight(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getWidth(), reference.getWidth());
}
/**
 * Tests whether the rectangle has the dimensions of PDRectangle.TABLOID rotated to landscape:
 * its width is checked against the Tabloid height and its height against the Tabloid width.
 */
private boolean isTabloidLandscape(PDRectangle rectangle) {
    PDRectangle reference = PDRectangle.TABLOID;
    if (!compareFloat(rectangle.getWidth(), reference.getHeight())) {
        return false;
    }
    return compareFloat(rectangle.getHeight(), reference.getWidth());
}
/**
 * Compares two floats with a tolerance.
 *
 * @param f1 first value
 * @param f2 second value
 * @return true if the values are equal or their absolute difference is below 1.0
 */
private boolean compareFloat(float f1, float f2) {
    if (f1 == f2) {
        return true;
    }
    float distance = Math.abs(f2 - f1);
    return distance < 1.0f;
}
private static class PageExtractor extends PDFGraphicsStreamEngine {
private final List<Map<String, Object>> images;
private final Set<COSStream> seen;
private boolean isColor;
private boolean isGray;
protected PageExtractor(PDPage page) {
private PageExtractor(PDPage page, Set<COSStream> seen) {
super(page);
this.seen = seen;
this.images = new ArrayList<>();
this.isColor = false;
this.isGray = false;
@ -204,11 +460,11 @@ public class DocumentAnalyzer {
}
@Override
public void moveTo(float x, float y) throws IOException {
public void moveTo(float x, float y) {
}
@Override
public void lineTo(float x, float y) throws IOException {
public void lineTo(float x, float y) {
}
@Override
@ -232,7 +488,6 @@ public class DocumentAnalyzer {
public void strokePath() {
String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
if (isColorSpaceName(colorSpaceName)) {
logger.log(Level.INFO, "strokepath: color true, " + colorSpaceName);
this.isColor = true;
}
if (isGraySpaceName(colorSpaceName)) {
@ -244,7 +499,6 @@ public class DocumentAnalyzer {
public void fillPath(int windingRule) {
String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
if (isColorSpaceName(colorSpaceName)) {
logger.log(Level.INFO, "fillpath: color true " + colorSpaceName);
this.isColor = true;
}
if (isGraySpaceName(colorSpaceName)) {
@ -263,6 +517,7 @@ public class DocumentAnalyzer {
/**
 * Reports whether the given color space name is one of "DeviceRGB", "DeviceCMYK",
 * "ICCBased" or "Indexed".
 *
 * @param name the color space name to test; a null name yields false
 * @return true if the name equals one of the listed names
 */
private boolean isColorSpaceName(String name) {
    if ("DeviceRGB".equals(name)) {
        return true;
    }
    if ("DeviceCMYK".equals(name)) {
        return true;
    }
    if ("ICCBased".equals(name)) {
        return true;
    }
    return "Indexed".equals(name);
}
@ -271,13 +526,13 @@ public class DocumentAnalyzer {
}
}
static class FontExtractor {
private static class FontExtractor {
private final List<Map<String, Object>> fonts;
private final PDResources res;
public FontExtractor(PDPage page) {
private FontExtractor(PDPage page) {
fonts = new ArrayList<>();
res = page.getResources();
}

View file

@ -18,16 +18,22 @@ public class DocumentAnalyzerTest {
@Test
public void testDocument() throws IOException {
Path tmp = Files.createTempDirectory("document-analyzer");
String sample = "antonio_sample.pdf";
String sample = "20200000063.pdf";
Path path = tmp.resolve(sample);
try (InputStream inputStream = getClass().getResourceAsStream(sample);
OutputStream outputStream = Files.newOutputStream(path)) {
if (inputStream != null) {
inputStream.transferTo(outputStream);
DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer();
documentAnalyzer.process(path.toFile());
logger.log(Level.INFO, "result = " + documentAnalyzer.getResult());
logger.log(Level.INFO, "isvalid = " + documentAnalyzer.isValid());
logger.log(Level.INFO, "iscolor = " + documentAnalyzer.isColor());
logger.log(Level.INFO, "isgray = " + documentAnalyzer.isGray());
logger.log(Level.INFO, "isA4 = " + documentAnalyzer.isA4());
logger.log(Level.INFO, "isLetter = " + documentAnalyzer.isLetter());
logger.log(Level.INFO, "islandscape = " + documentAnalyzer.isLandscape());
logger.log(Level.INFO, "isimage = " + documentAnalyzer.isImage());
}
}
Files.delete(path);