add color/gray detection in document analyzer
This commit is contained in:
parent
319dd41658
commit
b08fa2d137
5 changed files with 150 additions and 50 deletions
|
@ -1,5 +1,5 @@
|
|||
group = org.xbib.graphics
|
||||
name = graphics
|
||||
version = 5.0.1
|
||||
version = 5.0.2
|
||||
|
||||
org.gradle.warning.mode = ALL
|
||||
|
|
|
@ -13,6 +13,7 @@ test {
|
|||
file('/var/tmp/gs').mkdirs()
|
||||
systemProperty 'java.awt.headless', 'true'
|
||||
systemProperty 'java.io.tmpdir', '/var/tmp/'
|
||||
systemProperty 'pdfbox.fontcache', '/var/tmp/pdfbox'
|
||||
systemProperty 'jna.tmpdir', '/var/tmp/'
|
||||
systemProperty 'jna.debug', 'true'
|
||||
systemProperty 'java.util.logging.config.file', 'src/test/resources/logging.properties'
|
||||
|
|
|
@ -5,11 +5,10 @@ import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
|
|||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSStream;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
|
||||
|
@ -41,38 +40,64 @@ public class DocumentAnalyzer {
|
|||
List<Map<String, Object>> pages = new ArrayList<>();
|
||||
int imagecount = 0;
|
||||
int pagecount = document.getNumberOfPages();
|
||||
boolean isDocumentColor = false;
|
||||
boolean isDocumentGray = false;
|
||||
for (int i = 0; i < pagecount; i++) {
|
||||
PDPage pdPage = document.getPage(i);
|
||||
Map<String, Object> pageMap = analyze(i, pdPage);
|
||||
boolean isColor = (boolean) pageMap.get("iscolor");
|
||||
if (isColor) {
|
||||
isDocumentColor = true;
|
||||
}
|
||||
boolean isGray = (boolean) pageMap.get("isgray");
|
||||
if (isGray) {
|
||||
isDocumentGray = true;
|
||||
}
|
||||
List<Map<String, Object>> list = (List<Map<String, Object>>) pageMap.get("images");
|
||||
imagecount += list.size();
|
||||
pages.add(pageMap);
|
||||
}
|
||||
result.put("pages", pages);
|
||||
result.put("imagecount", imagecount);
|
||||
result.put("iscolor", isDocumentColor);
|
||||
result.put("isgray", isDocumentGray);
|
||||
result.put("pages", pages);
|
||||
} catch (Exception e) {
|
||||
logger.log(Level.SEVERE, e.getMessage(), e);
|
||||
logger.log(Level.WARNING, e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, Object> getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean isColor() {
|
||||
return (boolean) result.get("iscolor");
|
||||
}
|
||||
|
||||
|
||||
public boolean isGray() {
|
||||
return (boolean) result.get("isgray");
|
||||
}
|
||||
|
||||
private void documentToResult(PDDocument document) {
|
||||
try {
|
||||
result.put("author", document.getDocumentInformation().getAuthor());
|
||||
result.put("creator", document.getDocumentInformation().getCreator());
|
||||
result.put("producer", document.getDocumentInformation().getProducer());
|
||||
result.put("title", document.getDocumentInformation().getTitle());
|
||||
PDDocumentInformation documentInformation = document.getDocumentInformation();
|
||||
result.put("author", documentInformation.getAuthor());
|
||||
result.put("creator", documentInformation.getCreator());
|
||||
result.put("producer", documentInformation.getProducer());
|
||||
result.put("title", documentInformation.getTitle());
|
||||
result.put("pagecount", document.getNumberOfPages());
|
||||
Calendar calendar = document.getDocumentInformation().getCreationDate();
|
||||
Calendar calendar = documentInformation.getCreationDate();
|
||||
if (calendar != null) {
|
||||
result.put("creationDate", calendar.toInstant());
|
||||
}
|
||||
calendar = document.getDocumentInformation().getModificationDate();
|
||||
calendar = documentInformation.getModificationDate();
|
||||
if (calendar != null) {
|
||||
result.put("modificationDate", calendar.toInstant());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// NPE if creation/modification dates are borked
|
||||
/**
|
||||
/*
|
||||
* java.lang.NullPointerException: null
|
||||
* at java.text.SimpleDateFormat.matchZoneString(SimpleDateFormat.java:1695) ~[?:?]
|
||||
* at java.text.SimpleDateFormat.subParseZoneString(SimpleDateFormat.java:1763) ~[?:?]
|
||||
|
@ -85,14 +110,10 @@ public class DocumentAnalyzer {
|
|||
* at org.apache.pdfbox.cos.COSDictionary.getDate(COSDictionary.java:790) ~[pdfbox-2.0.12.jar:2.0.12]
|
||||
* at org.apache.pdfbox.pdmodel.PDDocumentInformation.getCreationDate(PDDocumentInformation.java:212) ~[pdfbox-2.0.12.jar:2.0.12]
|
||||
*/
|
||||
logger.log(Level.SEVERE, e.getMessage(), e);
|
||||
logger.log(Level.WARNING, e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, Object> getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public Map<String, Object> analyze(int i, PDPage page) throws IOException {
|
||||
Map<String, Object> m = new LinkedHashMap<>();
|
||||
m.put("page", i);
|
||||
|
@ -101,42 +122,46 @@ public class DocumentAnalyzer {
|
|||
m.put("mediabox", Map.of("height", page.getMediaBox().getHeight(), "width", page.getMediaBox().getWidth()));
|
||||
m.put("bleedbox", Map.of("height", page.getBleedBox().getHeight(), "width", page.getBleedBox().getWidth()));
|
||||
m.put("rotation", page.getRotation());
|
||||
ImageGraphicsExtractor extractor = new ImageGraphicsExtractor(page);
|
||||
extractor.process();
|
||||
m.put("images", extractor.getList());
|
||||
List<Map<String, Object>> fonts = new ArrayList<>();
|
||||
PDResources res = page.getResources();
|
||||
for (COSName cosName : res.getFontNames()) {
|
||||
PDFont font = res.getFont(cosName);
|
||||
if (font != null) {
|
||||
Map<String, Object> f = new LinkedHashMap<>();
|
||||
f.put("name", font.getName());
|
||||
f.put("damaged", font.isDamaged());
|
||||
f.put("embedded", font.isEmbedded());
|
||||
f.put("type", font.getType());
|
||||
f.put("subtype", font.getSubType());
|
||||
fonts.add(f);
|
||||
}
|
||||
}
|
||||
m.put("fonts", fonts);
|
||||
PageExtractor pageExtractor = new PageExtractor(page);
|
||||
pageExtractor.process();
|
||||
m.put("images", pageExtractor.getImages());
|
||||
m.put("iscolor", pageExtractor.isColor());
|
||||
m.put("isgray", pageExtractor.isGray());
|
||||
FontExtractor fontExtractor = new FontExtractor(page);
|
||||
fontExtractor.process();
|
||||
m.put("fonts", fontExtractor.getFonts());
|
||||
return m;
|
||||
}
|
||||
|
||||
class ImageGraphicsExtractor extends PDFGraphicsStreamEngine {
|
||||
class PageExtractor extends PDFGraphicsStreamEngine {
|
||||
|
||||
private final List<Map<String, Object>> list;
|
||||
private final List<Map<String, Object>> images;
|
||||
|
||||
protected ImageGraphicsExtractor(PDPage page) {
|
||||
private boolean isColor;
|
||||
|
||||
private boolean isGray;
|
||||
|
||||
protected PageExtractor(PDPage page) {
|
||||
super(page);
|
||||
this.list = new ArrayList<>();
|
||||
this.images = new ArrayList<>();
|
||||
this.isColor = false;
|
||||
this.isGray = false;
|
||||
}
|
||||
|
||||
public void process() throws IOException {
|
||||
processPage(getPage());
|
||||
}
|
||||
|
||||
public List<Map<String, Object>> getList() {
|
||||
return list;
|
||||
public List<Map<String, Object>> getImages() {
|
||||
return images;
|
||||
}
|
||||
|
||||
public boolean isColor() {
|
||||
return isColor;
|
||||
}
|
||||
|
||||
public boolean isGray() {
|
||||
return isGray;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -151,15 +176,26 @@ public class DocumentAnalyzer {
|
|||
return;
|
||||
}
|
||||
seen.add(xobject.getCOSObject());
|
||||
Map<String, Object> m = new LinkedHashMap<>();
|
||||
Map<String, Object> m = new LinkedHashMap<>();
|
||||
String colorSpaceName = xobject.getColorSpace().getName();
|
||||
boolean isColorSpace = isColorSpaceName(colorSpaceName);
|
||||
if (isColorSpace) {
|
||||
this.isColor = true;
|
||||
}
|
||||
boolean isGraySpace = isGraySpaceName(colorSpaceName);
|
||||
if (isGraySpace) {
|
||||
if (xobject.getBitsPerComponent() > 1) {
|
||||
this.isGray = true;
|
||||
}
|
||||
}
|
||||
m.put("width", xobject.getWidth());
|
||||
m.put("height", xobject.getHeight());
|
||||
m.put("bitspercomponent", xobject.getBitsPerComponent());
|
||||
m.put("colorspace", xobject.getColorSpace().getName());
|
||||
m.put("iscolor", PDDeviceRGB.INSTANCE.getName().equals(xobject.getColorSpace().getName()));
|
||||
m.put("isgray", PDDeviceGray.INSTANCE.getName().equals(xobject.getColorSpace().getName()));
|
||||
m.put("colorspace", colorSpaceName);
|
||||
m.put("suffix", xobject.getSuffix());
|
||||
list.add(m);
|
||||
m.put("iscolor", isColorSpace);
|
||||
m.put("isgray", isGraySpace);
|
||||
images.add(m);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -169,12 +205,10 @@ public class DocumentAnalyzer {
|
|||
|
||||
@Override
|
||||
public void moveTo(float x, float y) throws IOException {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void lineTo(float x, float y) throws IOException {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -196,10 +230,26 @@ public class DocumentAnalyzer {
|
|||
|
||||
@Override
|
||||
public void strokePath() {
|
||||
String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
|
||||
if (isColorSpaceName(colorSpaceName)) {
|
||||
logger.log(Level.INFO, "strokepath: color true, " + colorSpaceName);
|
||||
this.isColor = true;
|
||||
}
|
||||
if (isGraySpaceName(colorSpaceName)) {
|
||||
this.isGray = true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void fillPath(int windingRule) {
|
||||
String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
|
||||
if (isColorSpaceName(colorSpaceName)) {
|
||||
logger.log(Level.INFO, "fillpath: color true " + colorSpaceName);
|
||||
this.isColor = true;
|
||||
}
|
||||
if (isGraySpaceName(colorSpaceName)) {
|
||||
this.isGray = true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -209,6 +259,51 @@ public class DocumentAnalyzer {
|
|||
@Override
|
||||
public void shadingFill(COSName shadingName) {
|
||||
}
|
||||
|
||||
private boolean isColorSpaceName(String name) {
|
||||
return "DeviceRGB".equals(name) ||
|
||||
"DeviceCMYK".equals(name) ||
|
||||
"Indexed".equals(name);
|
||||
}
|
||||
|
||||
private boolean isGraySpaceName(String name) {
|
||||
return "DeviceGray".equals(name);
|
||||
}
|
||||
}
|
||||
|
||||
static class FontExtractor {
|
||||
|
||||
private final List<Map<String, Object>> fonts;
|
||||
|
||||
private final PDResources res;
|
||||
|
||||
public FontExtractor(PDPage page) {
|
||||
fonts = new ArrayList<>();
|
||||
res = page.getResources();
|
||||
}
|
||||
|
||||
public void process() {
|
||||
for (COSName cosName : res.getFontNames()) {
|
||||
try {
|
||||
PDFont font = res.getFont(cosName);
|
||||
if (font != null) {
|
||||
Map<String, Object> f = new LinkedHashMap<>();
|
||||
f.put("name", font.getName());
|
||||
f.put("damaged", font.isDamaged());
|
||||
f.put("embedded", font.isEmbedded());
|
||||
f.put("type", font.getType());
|
||||
f.put("subtype", font.getSubType());
|
||||
fonts.add(f);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.log(Level.WARNING, e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<Map<String, Object>> getFonts() {
|
||||
return fonts;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -18,13 +18,16 @@ public class DocumentAnalyzerTest {
|
|||
@Test
|
||||
public void testDocument() throws IOException {
|
||||
Path tmp = Files.createTempDirectory("document-analyzer");
|
||||
Path path = tmp.resolve("antonio_sample.pdf");
|
||||
try (InputStream inputStream = getClass().getResourceAsStream("antonio_sample.pdf");
|
||||
String sample = "antonio_sample.pdf";
|
||||
Path path = tmp.resolve(sample);
|
||||
try (InputStream inputStream = getClass().getResourceAsStream(sample);
|
||||
OutputStream outputStream = Files.newOutputStream(path)) {
|
||||
if (inputStream != null) {
|
||||
inputStream.transferTo(outputStream);
|
||||
DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
|
||||
logger.log(Level.INFO, "result = " + documentAnalyzer.getResult());
|
||||
logger.log(Level.INFO, "iscolor = " + documentAnalyzer.isColor());
|
||||
logger.log(Level.INFO, "isgray = " + documentAnalyzer.isGray());
|
||||
}
|
||||
}
|
||||
Files.delete(path);
|
||||
|
|
|
@ -6,3 +6,4 @@ java.util.logging.ConsoleHandler.formatter=java.util.logging.SimpleFormatter
|
|||
org.apache.fontbox.ttf.level=OFF
|
||||
org.apache.fontbox.util.autodetect.FontFileFinder.level=OFF
|
||||
org.apache.pdfbox.pdmodel.font.FileSystemFontProvider.level=OFF
|
||||
org.apache.pdfbox.contentstream.operator.graphics.level=OFF
|
||||
|
|
Loading…
Reference in a new issue