add color/gray detection in document analyzer

This commit is contained in:
Jörg Prante 2023-11-27 18:44:24 +01:00
parent 319dd41658
commit b08fa2d137
5 changed files with 150 additions and 50 deletions

View file

@ -1,5 +1,5 @@
group = org.xbib.graphics
name = graphics
version = 5.0.1
version = 5.0.2
org.gradle.warning.mode = ALL

View file

@ -13,6 +13,7 @@ test {
file('/var/tmp/gs').mkdirs()
systemProperty 'java.awt.headless', 'true'
systemProperty 'java.io.tmpdir', '/var/tmp/'
systemProperty 'pdfbox.fontcache', '/var/tmp/pdfbox'
systemProperty 'jna.tmpdir', '/var/tmp/'
systemProperty 'jna.debug', 'true'
systemProperty 'java.util.logging.config.file', 'src/test/resources/logging.properties'

View file

@ -5,11 +5,10 @@ import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
@ -41,38 +40,64 @@ public class DocumentAnalyzer {
List<Map<String, Object>> pages = new ArrayList<>();
int imagecount = 0;
int pagecount = document.getNumberOfPages();
boolean isDocumentColor = false;
boolean isDocumentGray = false;
for (int i = 0; i < pagecount; i++) {
PDPage pdPage = document.getPage(i);
Map<String, Object> pageMap = analyze(i, pdPage);
boolean isColor = (boolean) pageMap.get("iscolor");
if (isColor) {
isDocumentColor = true;
}
boolean isGray = (boolean) pageMap.get("isgray");
if (isGray) {
isDocumentGray = true;
}
List<Map<String, Object>> list = (List<Map<String, Object>>) pageMap.get("images");
imagecount += list.size();
pages.add(pageMap);
}
result.put("pages", pages);
result.put("imagecount", imagecount);
result.put("iscolor", isDocumentColor);
result.put("isgray", isDocumentGray);
result.put("pages", pages);
} catch (Exception e) {
logger.log(Level.SEVERE, e.getMessage(), e);
logger.log(Level.WARNING, e.getMessage(), e);
}
}
/**
 * Returns the map into which this analyzer stores its findings.
 *
 * <p>The internal map instance is handed out directly, not a copy.
 *
 * @return the result map of this analyzer
 */
public Map<String, Object> getResult() {
return result;
}
/**
 * Reports whether the analysis flagged the document as containing color.
 *
 * <p>Reads the {@code "iscolor"} entry of the result map. When the entry is absent, for
 * example because the analysis was aborted by an exception before the flag was stored,
 * this returns {@code false} rather than failing with a {@link NullPointerException}
 * while unboxing {@code null}.
 *
 * @return {@code true} only if the result map holds {@code Boolean.TRUE} under {@code "iscolor"}
 */
public boolean isColor() {
    // Boolean.TRUE.equals(...) is null-safe; a plain (boolean) cast would unbox null and throw.
    return Boolean.TRUE.equals(result.get("iscolor"));
}
/**
 * Reports whether the analysis flagged the document as containing gray content.
 *
 * <p>Reads the {@code "isgray"} entry of the result map. When the entry is absent, for
 * example because the analysis was aborted by an exception before the flag was stored,
 * this returns {@code false} rather than failing with a {@link NullPointerException}
 * while unboxing {@code null}.
 *
 * @return {@code true} only if the result map holds {@code Boolean.TRUE} under {@code "isgray"}
 */
public boolean isGray() {
    // Boolean.TRUE.equals(...) is null-safe; a plain (boolean) cast would unbox null and throw.
    return Boolean.TRUE.equals(result.get("isgray"));
}
private void documentToResult(PDDocument document) {
try {
result.put("author", document.getDocumentInformation().getAuthor());
result.put("creator", document.getDocumentInformation().getCreator());
result.put("producer", document.getDocumentInformation().getProducer());
result.put("title", document.getDocumentInformation().getTitle());
PDDocumentInformation documentInformation = document.getDocumentInformation();
result.put("author", documentInformation.getAuthor());
result.put("creator", documentInformation.getCreator());
result.put("producer", documentInformation.getProducer());
result.put("title", documentInformation.getTitle());
result.put("pagecount", document.getNumberOfPages());
Calendar calendar = document.getDocumentInformation().getCreationDate();
Calendar calendar = documentInformation.getCreationDate();
if (calendar != null) {
result.put("creationDate", calendar.toInstant());
}
calendar = document.getDocumentInformation().getModificationDate();
calendar = documentInformation.getModificationDate();
if (calendar != null) {
result.put("modificationDate", calendar.toInstant());
}
} catch (Exception e) {
// NPE if creation/modification dates are borked
/**
/*
* java.lang.NullPointerException: null
* at java.text.SimpleDateFormat.matchZoneString(SimpleDateFormat.java:1695) ~[?:?]
* at java.text.SimpleDateFormat.subParseZoneString(SimpleDateFormat.java:1763) ~[?:?]
@ -85,14 +110,10 @@ public class DocumentAnalyzer {
* at org.apache.pdfbox.cos.COSDictionary.getDate(COSDictionary.java:790) ~[pdfbox-2.0.12.jar:2.0.12]
* at org.apache.pdfbox.pdmodel.PDDocumentInformation.getCreationDate(PDDocumentInformation.java:212) ~[pdfbox-2.0.12.jar:2.0.12]
*/
logger.log(Level.SEVERE, e.getMessage(), e);
logger.log(Level.WARNING, e.getMessage(), e);
}
}
public Map<String, Object> getResult() {
return result;
}
public Map<String, Object> analyze(int i, PDPage page) throws IOException {
Map<String, Object> m = new LinkedHashMap<>();
m.put("page", i);
@ -101,42 +122,46 @@ public class DocumentAnalyzer {
m.put("mediabox", Map.of("height", page.getMediaBox().getHeight(), "width", page.getMediaBox().getWidth()));
m.put("bleedbox", Map.of("height", page.getBleedBox().getHeight(), "width", page.getBleedBox().getWidth()));
m.put("rotation", page.getRotation());
ImageGraphicsExtractor extractor = new ImageGraphicsExtractor(page);
extractor.process();
m.put("images", extractor.getList());
List<Map<String, Object>> fonts = new ArrayList<>();
PDResources res = page.getResources();
for (COSName cosName : res.getFontNames()) {
PDFont font = res.getFont(cosName);
if (font != null) {
Map<String, Object> f = new LinkedHashMap<>();
f.put("name", font.getName());
f.put("damaged", font.isDamaged());
f.put("embedded", font.isEmbedded());
f.put("type", font.getType());
f.put("subtype", font.getSubType());
fonts.add(f);
}
}
m.put("fonts", fonts);
PageExtractor pageExtractor = new PageExtractor(page);
pageExtractor.process();
m.put("images", pageExtractor.getImages());
m.put("iscolor", pageExtractor.isColor());
m.put("isgray", pageExtractor.isGray());
FontExtractor fontExtractor = new FontExtractor(page);
fontExtractor.process();
m.put("fonts", fontExtractor.getFonts());
return m;
}
class ImageGraphicsExtractor extends PDFGraphicsStreamEngine {
class PageExtractor extends PDFGraphicsStreamEngine {
private final List<Map<String, Object>> list;
private final List<Map<String, Object>> images;
protected ImageGraphicsExtractor(PDPage page) {
private boolean isColor;
private boolean isGray;
protected PageExtractor(PDPage page) {
super(page);
this.list = new ArrayList<>();
this.images = new ArrayList<>();
this.isColor = false;
this.isGray = false;
}
/**
 * Runs this stream engine over the page it was constructed with.
 *
 * @throws IOException if {@code processPage} fails
 */
public void process() throws IOException {
processPage(getPage());
}
public List<Map<String, Object>> getList() {
return list;
public List<Map<String, Object>> getImages() {
return images;
}
/**
 * Returns the color flag of this extractor.
 *
 * <p>The flag starts out {@code false} and is switched on while the page is processed,
 * whenever an image, a stroke or a fill reports a color space name accepted by
 * {@code isColorSpaceName}.
 *
 * @return the current value of the color flag
 */
public boolean isColor() {
return isColor;
}
/**
 * Returns the gray flag of this extractor.
 *
 * <p>The flag starts out {@code false} and is switched on while the page is processed:
 * by an image whose color space name is accepted by {@code isGraySpaceName} and that has
 * more than one bit per component, or by a stroke or fill with such a color space name.
 *
 * @return the current value of the gray flag
 */
public boolean isGray() {
return isGray;
}
@Override
@ -151,15 +176,26 @@ public class DocumentAnalyzer {
return;
}
seen.add(xobject.getCOSObject());
Map<String, Object> m = new LinkedHashMap<>();
Map<String, Object> m = new LinkedHashMap<>();
String colorSpaceName = xobject.getColorSpace().getName();
boolean isColorSpace = isColorSpaceName(colorSpaceName);
if (isColorSpace) {
this.isColor = true;
}
boolean isGraySpace = isGraySpaceName(colorSpaceName);
if (isGraySpace) {
if (xobject.getBitsPerComponent() > 1) {
this.isGray = true;
}
}
m.put("width", xobject.getWidth());
m.put("height", xobject.getHeight());
m.put("bitspercomponent", xobject.getBitsPerComponent());
m.put("colorspace", xobject.getColorSpace().getName());
m.put("iscolor", PDDeviceRGB.INSTANCE.getName().equals(xobject.getColorSpace().getName()));
m.put("isgray", PDDeviceGray.INSTANCE.getName().equals(xobject.getColorSpace().getName()));
m.put("colorspace", colorSpaceName);
m.put("suffix", xobject.getSuffix());
list.add(m);
m.put("iscolor", isColorSpace);
m.put("isgray", isGraySpace);
images.add(m);
}
}
@ -169,12 +205,10 @@ public class DocumentAnalyzer {
/**
 * Empty implementation; this extractor records nothing for this operation.
 *
 * @param x unused
 * @param y unused
 */
@Override
public void moveTo(float x, float y) throws IOException {
}
/**
 * Empty implementation; this extractor records nothing for this operation.
 *
 * @param x unused
 * @param y unused
 */
@Override
public void lineTo(float x, float y) throws IOException {
}
@Override
@ -196,10 +230,26 @@ public class DocumentAnalyzer {
/**
 * Classifies the color space of the current stroking color and raises the matching flag.
 *
 * <p>A name accepted by {@code isColorSpaceName} is logged at INFO level and sets the
 * color flag; a name accepted by {@code isGraySpaceName} sets the gray flag.
 */
@Override
public void strokePath() {
    final String strokeSpace = getGraphicsState().getStrokingColor().getColorSpace().getName();
    final boolean colored = isColorSpaceName(strokeSpace);
    final boolean gray = isGraySpaceName(strokeSpace);
    if (colored) {
        logger.log(Level.INFO, "strokepath: color true, " + strokeSpace);
        this.isColor = true;
    }
    if (gray) {
        this.isGray = true;
    }
}
/**
 * Classifies the color space used for a path fill and raises the matching flag.
 *
 * <p>Fill operations paint with the non-stroking color of the graphics state, so that
 * color is inspected here. The stroking color only governs {@code strokePath} and would
 * misreport fills whenever the two colors differ.
 *
 * <p>A name accepted by {@code isColorSpaceName} is logged at INFO level and sets the
 * color flag; a name accepted by {@code isGraySpaceName} sets the gray flag.
 *
 * @param windingRule the fill rule of the operation; not used for the classification
 */
@Override
public void fillPath(int windingRule) {
    // Fills use the non-stroking color; reading the stroking color here was a bug.
    String colorSpaceName = getGraphicsState().getNonStrokingColor().getColorSpace().getName();
    if (isColorSpaceName(colorSpaceName)) {
        logger.log(Level.INFO, "fillpath: color true " + colorSpaceName);
        this.isColor = true;
    }
    if (isGraySpaceName(colorSpaceName)) {
        this.isGray = true;
    }
}
@Override
@ -209,6 +259,51 @@ public class DocumentAnalyzer {
/**
 * Empty implementation; this extractor records nothing for this operation.
 *
 * @param shadingName unused
 */
@Override
public void shadingFill(COSName shadingName) {
}
/**
 * Tells whether a color space name is one that this extractor counts as color.
 *
 * @param name the color space name to classify; {@code null} yields {@code false}
 * @return {@code true} exactly for {@code "DeviceRGB"}, {@code "DeviceCMYK"} and {@code "Indexed"}
 */
private boolean isColorSpaceName(String name) {
    if ("DeviceRGB".equals(name)) {
        return true;
    }
    if ("DeviceCMYK".equals(name)) {
        return true;
    }
    return "Indexed".equals(name);
}
/**
 * Tells whether a color space name is the one this extractor counts as gray.
 *
 * @param name the color space name to classify; {@code null} yields {@code false}
 * @return {@code true} exactly for {@code "DeviceGray"}
 */
private boolean isGraySpaceName(String name) {
    final String grayName = "DeviceGray";
    return grayName.equals(name);
}
}
/**
 * Collects descriptive properties of the fonts named in a page's resources.
 *
 * <p>Each font becomes one map with the keys {@code "name"}, {@code "damaged"},
 * {@code "embedded"}, {@code "type"} and {@code "subtype"}.
 */
static class FontExtractor {

    /** Font descriptions gathered so far, in the order the fonts were visited. */
    private final List<Map<String, Object>> fonts;

    /** Resources of the page; {@code null} when the page has no resource dictionary. */
    private final PDResources res;

    /**
     * Creates an extractor for the given page. No fonts are read until {@link #process()} is called.
     *
     * @param page the page whose resources are inspected
     */
    public FontExtractor(PDPage page) {
        fonts = new ArrayList<>();
        res = page.getResources();
    }

    /**
     * Visits every font name in the page resources and appends one description per loadable font.
     *
     * <p>A page without resources contributes no fonts. A font that fails to load with an
     * {@link IOException} is logged at WARNING level and skipped, so the remaining fonts
     * are still reported.
     */
    public void process() {
        // A page may lack a resource dictionary; treat that as "no fonts" instead of an NPE.
        if (res == null) {
            return;
        }
        for (COSName cosName : res.getFontNames()) {
            try {
                PDFont font = res.getFont(cosName);
                if (font != null) {
                    Map<String, Object> f = new LinkedHashMap<>();
                    f.put("name", font.getName());
                    f.put("damaged", font.isDamaged());
                    f.put("embedded", font.isEmbedded());
                    f.put("type", font.getType());
                    f.put("subtype", font.getSubType());
                    fonts.add(f);
                }
            } catch (IOException e) {
                logger.log(Level.WARNING, e.getMessage(), e);
            }
        }
    }

    /**
     * Returns the font descriptions collected by {@link #process()}.
     *
     * <p>The internal list instance is handed out directly, not a copy.
     *
     * @return the list of font descriptions; empty until {@link #process()} has found fonts
     */
    public List<Map<String, Object>> getFonts() {
        return fonts;
    }
}
}

View file

@ -18,13 +18,16 @@ public class DocumentAnalyzerTest {
@Test
public void testDocument() throws IOException {
Path tmp = Files.createTempDirectory("document-analyzer");
Path path = tmp.resolve("antonio_sample.pdf");
try (InputStream inputStream = getClass().getResourceAsStream("antonio_sample.pdf");
String sample = "antonio_sample.pdf";
Path path = tmp.resolve(sample);
try (InputStream inputStream = getClass().getResourceAsStream(sample);
OutputStream outputStream = Files.newOutputStream(path)) {
if (inputStream != null) {
inputStream.transferTo(outputStream);
DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
logger.log(Level.INFO, "result = " + documentAnalyzer.getResult());
logger.log(Level.INFO, "iscolor = " + documentAnalyzer.isColor());
logger.log(Level.INFO, "isgray = " + documentAnalyzer.isGray());
}
}
Files.delete(path);

View file

@ -6,3 +6,4 @@ java.util.logging.ConsoleHandler.formatter=java.util.logging.SimpleFormatter
org.apache.fontbox.ttf.level=OFF
org.apache.fontbox.util.autodetect.FontFileFinder.level=OFF
org.apache.pdfbox.pdmodel.font.FileSystemFontProvider.level=OFF
org.apache.pdfbox.contentstream.operator.graphics.level=OFF