add color/gray detection in document analyzer
This commit is contained in:
parent
319dd41658
commit
b08fa2d137
5 changed files with 150 additions and 50 deletions
|
@ -1,5 +1,5 @@
|
||||||
group = org.xbib.graphics
|
group = org.xbib.graphics
|
||||||
name = graphics
|
name = graphics
|
||||||
version = 5.0.1
|
version = 5.0.2
|
||||||
|
|
||||||
org.gradle.warning.mode = ALL
|
org.gradle.warning.mode = ALL
|
||||||
|
|
|
@ -13,6 +13,7 @@ test {
|
||||||
file('/var/tmp/gs').mkdirs()
|
file('/var/tmp/gs').mkdirs()
|
||||||
systemProperty 'java.awt.headless', 'true'
|
systemProperty 'java.awt.headless', 'true'
|
||||||
systemProperty 'java.io.tmpdir', '/var/tmp/'
|
systemProperty 'java.io.tmpdir', '/var/tmp/'
|
||||||
|
systemProperty 'pdfbox.fontcache', '/var/tmp/pdfbox'
|
||||||
systemProperty 'jna.tmpdir', '/var/tmp/'
|
systemProperty 'jna.tmpdir', '/var/tmp/'
|
||||||
systemProperty 'jna.debug', 'true'
|
systemProperty 'jna.debug', 'true'
|
||||||
systemProperty 'java.util.logging.config.file', 'src/test/resources/logging.properties'
|
systemProperty 'java.util.logging.config.file', 'src/test/resources/logging.properties'
|
||||||
|
|
|
@ -5,11 +5,10 @@ import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
import org.apache.pdfbox.cos.COSStream;
|
import org.apache.pdfbox.cos.COSStream;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.PDResources;
|
import org.apache.pdfbox.pdmodel.PDResources;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
|
||||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
|
||||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
|
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||||
|
|
||||||
|
@ -41,38 +40,64 @@ public class DocumentAnalyzer {
|
||||||
List<Map<String, Object>> pages = new ArrayList<>();
|
List<Map<String, Object>> pages = new ArrayList<>();
|
||||||
int imagecount = 0;
|
int imagecount = 0;
|
||||||
int pagecount = document.getNumberOfPages();
|
int pagecount = document.getNumberOfPages();
|
||||||
|
boolean isDocumentColor = false;
|
||||||
|
boolean isDocumentGray = false;
|
||||||
for (int i = 0; i < pagecount; i++) {
|
for (int i = 0; i < pagecount; i++) {
|
||||||
PDPage pdPage = document.getPage(i);
|
PDPage pdPage = document.getPage(i);
|
||||||
Map<String, Object> pageMap = analyze(i, pdPage);
|
Map<String, Object> pageMap = analyze(i, pdPage);
|
||||||
|
boolean isColor = (boolean) pageMap.get("iscolor");
|
||||||
|
if (isColor) {
|
||||||
|
isDocumentColor = true;
|
||||||
|
}
|
||||||
|
boolean isGray = (boolean) pageMap.get("isgray");
|
||||||
|
if (isGray) {
|
||||||
|
isDocumentGray = true;
|
||||||
|
}
|
||||||
List<Map<String, Object>> list = (List<Map<String, Object>>) pageMap.get("images");
|
List<Map<String, Object>> list = (List<Map<String, Object>>) pageMap.get("images");
|
||||||
imagecount += list.size();
|
imagecount += list.size();
|
||||||
pages.add(pageMap);
|
pages.add(pageMap);
|
||||||
}
|
}
|
||||||
result.put("pages", pages);
|
|
||||||
result.put("imagecount", imagecount);
|
result.put("imagecount", imagecount);
|
||||||
|
result.put("iscolor", isDocumentColor);
|
||||||
|
result.put("isgray", isDocumentGray);
|
||||||
|
result.put("pages", pages);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.log(Level.SEVERE, e.getMessage(), e);
|
logger.log(Level.WARNING, e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the map into which this analyzer writes its findings, e.g. the
 * "pages", "imagecount", "iscolor" and "isgray" entries and document
 * metadata such as "author" and "title".
 *
 * The internal map instance itself is returned, not a copy.
 *
 * @return the analysis result map
 */
public Map<String, Object> getResult() {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isColor() {
|
||||||
|
return (boolean) result.get("iscolor");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isGray() {
|
||||||
|
return (boolean) result.get("isgray");
|
||||||
|
}
|
||||||
|
|
||||||
private void documentToResult(PDDocument document) {
|
private void documentToResult(PDDocument document) {
|
||||||
try {
|
try {
|
||||||
result.put("author", document.getDocumentInformation().getAuthor());
|
PDDocumentInformation documentInformation = document.getDocumentInformation();
|
||||||
result.put("creator", document.getDocumentInformation().getCreator());
|
result.put("author", documentInformation.getAuthor());
|
||||||
result.put("producer", document.getDocumentInformation().getProducer());
|
result.put("creator", documentInformation.getCreator());
|
||||||
result.put("title", document.getDocumentInformation().getTitle());
|
result.put("producer", documentInformation.getProducer());
|
||||||
|
result.put("title", documentInformation.getTitle());
|
||||||
result.put("pagecount", document.getNumberOfPages());
|
result.put("pagecount", document.getNumberOfPages());
|
||||||
Calendar calendar = document.getDocumentInformation().getCreationDate();
|
Calendar calendar = documentInformation.getCreationDate();
|
||||||
if (calendar != null) {
|
if (calendar != null) {
|
||||||
result.put("creationDate", calendar.toInstant());
|
result.put("creationDate", calendar.toInstant());
|
||||||
}
|
}
|
||||||
calendar = document.getDocumentInformation().getModificationDate();
|
calendar = documentInformation.getModificationDate();
|
||||||
if (calendar != null) {
|
if (calendar != null) {
|
||||||
result.put("modificationDate", calendar.toInstant());
|
result.put("modificationDate", calendar.toInstant());
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// NPE if creation/modification dates are borked
|
// NPE if creation/modification dates are borked
|
||||||
/**
|
/*
|
||||||
* java.lang.NullPointerException: null
|
* java.lang.NullPointerException: null
|
||||||
* at java.text.SimpleDateFormat.matchZoneString(SimpleDateFormat.java:1695) ~[?:?]
|
* at java.text.SimpleDateFormat.matchZoneString(SimpleDateFormat.java:1695) ~[?:?]
|
||||||
* at java.text.SimpleDateFormat.subParseZoneString(SimpleDateFormat.java:1763) ~[?:?]
|
* at java.text.SimpleDateFormat.subParseZoneString(SimpleDateFormat.java:1763) ~[?:?]
|
||||||
|
@ -85,14 +110,10 @@ public class DocumentAnalyzer {
|
||||||
* at org.apache.pdfbox.cos.COSDictionary.getDate(COSDictionary.java:790) ~[pdfbox-2.0.12.jar:2.0.12]
|
* at org.apache.pdfbox.cos.COSDictionary.getDate(COSDictionary.java:790) ~[pdfbox-2.0.12.jar:2.0.12]
|
||||||
* at org.apache.pdfbox.pdmodel.PDDocumentInformation.getCreationDate(PDDocumentInformation.java:212) ~[pdfbox-2.0.12.jar:2.0.12]
|
* at org.apache.pdfbox.pdmodel.PDDocumentInformation.getCreationDate(PDDocumentInformation.java:212) ~[pdfbox-2.0.12.jar:2.0.12]
|
||||||
*/
|
*/
|
||||||
logger.log(Level.SEVERE, e.getMessage(), e);
|
logger.log(Level.WARNING, e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, Object> getResult() {
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Map<String, Object> analyze(int i, PDPage page) throws IOException {
|
public Map<String, Object> analyze(int i, PDPage page) throws IOException {
|
||||||
Map<String, Object> m = new LinkedHashMap<>();
|
Map<String, Object> m = new LinkedHashMap<>();
|
||||||
m.put("page", i);
|
m.put("page", i);
|
||||||
|
@ -101,42 +122,46 @@ public class DocumentAnalyzer {
|
||||||
m.put("mediabox", Map.of("height", page.getMediaBox().getHeight(), "width", page.getMediaBox().getWidth()));
|
m.put("mediabox", Map.of("height", page.getMediaBox().getHeight(), "width", page.getMediaBox().getWidth()));
|
||||||
m.put("bleedbox", Map.of("height", page.getBleedBox().getHeight(), "width", page.getBleedBox().getWidth()));
|
m.put("bleedbox", Map.of("height", page.getBleedBox().getHeight(), "width", page.getBleedBox().getWidth()));
|
||||||
m.put("rotation", page.getRotation());
|
m.put("rotation", page.getRotation());
|
||||||
ImageGraphicsExtractor extractor = new ImageGraphicsExtractor(page);
|
PageExtractor pageExtractor = new PageExtractor(page);
|
||||||
extractor.process();
|
pageExtractor.process();
|
||||||
m.put("images", extractor.getList());
|
m.put("images", pageExtractor.getImages());
|
||||||
List<Map<String, Object>> fonts = new ArrayList<>();
|
m.put("iscolor", pageExtractor.isColor());
|
||||||
PDResources res = page.getResources();
|
m.put("isgray", pageExtractor.isGray());
|
||||||
for (COSName cosName : res.getFontNames()) {
|
FontExtractor fontExtractor = new FontExtractor(page);
|
||||||
PDFont font = res.getFont(cosName);
|
fontExtractor.process();
|
||||||
if (font != null) {
|
m.put("fonts", fontExtractor.getFonts());
|
||||||
Map<String, Object> f = new LinkedHashMap<>();
|
|
||||||
f.put("name", font.getName());
|
|
||||||
f.put("damaged", font.isDamaged());
|
|
||||||
f.put("embedded", font.isEmbedded());
|
|
||||||
f.put("type", font.getType());
|
|
||||||
f.put("subtype", font.getSubType());
|
|
||||||
fonts.add(f);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
m.put("fonts", fonts);
|
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
|
|
||||||
class ImageGraphicsExtractor extends PDFGraphicsStreamEngine {
|
class PageExtractor extends PDFGraphicsStreamEngine {
|
||||||
|
|
||||||
private final List<Map<String, Object>> list;
|
private final List<Map<String, Object>> images;
|
||||||
|
|
||||||
protected ImageGraphicsExtractor(PDPage page) {
|
private boolean isColor;
|
||||||
|
|
||||||
|
private boolean isGray;
|
||||||
|
|
||||||
|
protected PageExtractor(PDPage page) {
|
||||||
super(page);
|
super(page);
|
||||||
this.list = new ArrayList<>();
|
this.images = new ArrayList<>();
|
||||||
|
this.isColor = false;
|
||||||
|
this.isGray = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void process() throws IOException {
|
public void process() throws IOException {
|
||||||
processPage(getPage());
|
processPage(getPage());
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Map<String, Object>> getList() {
|
public List<Map<String, Object>> getImages() {
|
||||||
return list;
|
return images;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isColor() {
|
||||||
|
return isColor;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isGray() {
|
||||||
|
return isGray;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -152,14 +177,25 @@ public class DocumentAnalyzer {
|
||||||
}
|
}
|
||||||
seen.add(xobject.getCOSObject());
|
seen.add(xobject.getCOSObject());
|
||||||
Map<String, Object> m = new LinkedHashMap<>();
|
Map<String, Object> m = new LinkedHashMap<>();
|
||||||
|
String colorSpaceName = xobject.getColorSpace().getName();
|
||||||
|
boolean isColorSpace = isColorSpaceName(colorSpaceName);
|
||||||
|
if (isColorSpace) {
|
||||||
|
this.isColor = true;
|
||||||
|
}
|
||||||
|
boolean isGraySpace = isGraySpaceName(colorSpaceName);
|
||||||
|
if (isGraySpace) {
|
||||||
|
if (xobject.getBitsPerComponent() > 1) {
|
||||||
|
this.isGray = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
m.put("width", xobject.getWidth());
|
m.put("width", xobject.getWidth());
|
||||||
m.put("height", xobject.getHeight());
|
m.put("height", xobject.getHeight());
|
||||||
m.put("bitspercomponent", xobject.getBitsPerComponent());
|
m.put("bitspercomponent", xobject.getBitsPerComponent());
|
||||||
m.put("colorspace", xobject.getColorSpace().getName());
|
m.put("colorspace", colorSpaceName);
|
||||||
m.put("iscolor", PDDeviceRGB.INSTANCE.getName().equals(xobject.getColorSpace().getName()));
|
|
||||||
m.put("isgray", PDDeviceGray.INSTANCE.getName().equals(xobject.getColorSpace().getName()));
|
|
||||||
m.put("suffix", xobject.getSuffix());
|
m.put("suffix", xobject.getSuffix());
|
||||||
list.add(m);
|
m.put("iscolor", isColorSpace);
|
||||||
|
m.put("isgray", isGraySpace);
|
||||||
|
images.add(m);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -169,12 +205,10 @@ public class DocumentAnalyzer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void moveTo(float x, float y) throws IOException {
|
public void moveTo(float x, float y) throws IOException {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void lineTo(float x, float y) throws IOException {
|
public void lineTo(float x, float y) throws IOException {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -196,10 +230,26 @@ public class DocumentAnalyzer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void strokePath() {
|
public void strokePath() {
|
||||||
|
String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
|
||||||
|
if (isColorSpaceName(colorSpaceName)) {
|
||||||
|
logger.log(Level.INFO, "strokepath: color true, " + colorSpaceName);
|
||||||
|
this.isColor = true;
|
||||||
|
}
|
||||||
|
if (isGraySpaceName(colorSpaceName)) {
|
||||||
|
this.isGray = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void fillPath(int windingRule) {
|
public void fillPath(int windingRule) {
|
||||||
|
String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
|
||||||
|
if (isColorSpaceName(colorSpaceName)) {
|
||||||
|
logger.log(Level.INFO, "fillpath: color true " + colorSpaceName);
|
||||||
|
this.isColor = true;
|
||||||
|
}
|
||||||
|
if (isGraySpaceName(colorSpaceName)) {
|
||||||
|
this.isGray = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -209,6 +259,51 @@ public class DocumentAnalyzer {
|
||||||
@Override
|
@Override
|
||||||
public void shadingFill(COSName shadingName) {
|
public void shadingFill(COSName shadingName) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean isColorSpaceName(String name) {
|
||||||
|
return "DeviceRGB".equals(name) ||
|
||||||
|
"DeviceCMYK".equals(name) ||
|
||||||
|
"Indexed".equals(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isGraySpaceName(String name) {
|
||||||
|
return "DeviceGray".equals(name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class FontExtractor {
|
||||||
|
|
||||||
|
private final List<Map<String, Object>> fonts;
|
||||||
|
|
||||||
|
private final PDResources res;
|
||||||
|
|
||||||
|
public FontExtractor(PDPage page) {
|
||||||
|
fonts = new ArrayList<>();
|
||||||
|
res = page.getResources();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void process() {
|
||||||
|
for (COSName cosName : res.getFontNames()) {
|
||||||
|
try {
|
||||||
|
PDFont font = res.getFont(cosName);
|
||||||
|
if (font != null) {
|
||||||
|
Map<String, Object> f = new LinkedHashMap<>();
|
||||||
|
f.put("name", font.getName());
|
||||||
|
f.put("damaged", font.isDamaged());
|
||||||
|
f.put("embedded", font.isEmbedded());
|
||||||
|
f.put("type", font.getType());
|
||||||
|
f.put("subtype", font.getSubType());
|
||||||
|
fonts.add(f);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.log(Level.WARNING, e.getMessage(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Map<String, Object>> getFonts() {
|
||||||
|
return fonts;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,13 +18,16 @@ public class DocumentAnalyzerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testDocument() throws IOException {
|
public void testDocument() throws IOException {
|
||||||
Path tmp = Files.createTempDirectory("document-analyzer");
|
Path tmp = Files.createTempDirectory("document-analyzer");
|
||||||
Path path = tmp.resolve("antonio_sample.pdf");
|
String sample = "antonio_sample.pdf";
|
||||||
try (InputStream inputStream = getClass().getResourceAsStream("antonio_sample.pdf");
|
Path path = tmp.resolve(sample);
|
||||||
|
try (InputStream inputStream = getClass().getResourceAsStream(sample);
|
||||||
OutputStream outputStream = Files.newOutputStream(path)) {
|
OutputStream outputStream = Files.newOutputStream(path)) {
|
||||||
if (inputStream != null) {
|
if (inputStream != null) {
|
||||||
inputStream.transferTo(outputStream);
|
inputStream.transferTo(outputStream);
|
||||||
DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
|
DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
|
||||||
logger.log(Level.INFO, "result = " + documentAnalyzer.getResult());
|
logger.log(Level.INFO, "result = " + documentAnalyzer.getResult());
|
||||||
|
logger.log(Level.INFO, "iscolor = " + documentAnalyzer.isColor());
|
||||||
|
logger.log(Level.INFO, "isgray = " + documentAnalyzer.isGray());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Files.delete(path);
|
Files.delete(path);
|
||||||
|
|
|
@ -6,3 +6,4 @@ java.util.logging.ConsoleHandler.formatter=java.util.logging.SimpleFormatter
|
||||||
org.apache.fontbox.ttf.level=OFF
|
org.apache.fontbox.ttf.level=OFF
|
||||||
org.apache.fontbox.util.autodetect.FontFileFinder.level=OFF
|
org.apache.fontbox.util.autodetect.FontFileFinder.level=OFF
|
||||||
org.apache.pdfbox.pdmodel.font.FileSystemFontProvider.level=OFF
|
org.apache.pdfbox.pdmodel.font.FileSystemFontProvider.level=OFF
|
||||||
|
org.apache.pdfbox.contentstream.operator.graphics.level=OFF
|
||||||
|
|
Loading…
Reference in a new issue