enhance document analyzer
This commit is contained in:
parent
b08fa2d137
commit
67609394cc
3 changed files with 286 additions and 25 deletions
|
@ -1,5 +1,5 @@
|
|||
group = org.xbib.graphics
|
||||
name = graphics
|
||||
version = 5.0.2
|
||||
version = 5.0.3
|
||||
|
||||
org.gradle.warning.mode = ALL
|
||||
|
|
|
@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
|||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
|
@ -19,6 +20,7 @@ import java.util.ArrayList;
|
|||
import java.util.Calendar;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
@ -29,22 +31,30 @@ public class DocumentAnalyzer {
|
|||
|
||||
private static final Logger logger = Logger.getLogger(DocumentAnalyzer.class.getName());
|
||||
|
||||
private final Map<String, Object> result = new LinkedHashMap<>();
|
||||
private final Map<String, Object> result;
|
||||
|
||||
private final Set<COSStream> seen = new HashSet<>();
|
||||
/**
 * Creates an analyzer with an empty, insertion-ordered result map.
 * No document is read until {@code process(File)} is called.
 */
public DocumentAnalyzer() {
    this.result = new LinkedHashMap<>();
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public DocumentAnalyzer(File file) {
|
||||
public void process(File file) {
|
||||
result.clear();
|
||||
Set<COSStream> seen = new HashSet<>();
|
||||
try (PDDocument document = Loader.loadPDF(file)) {
|
||||
documentToResult(document);
|
||||
documentInformationToResult(document.getDocumentInformation());
|
||||
List<Map<String, Object>> pages = new ArrayList<>();
|
||||
int imagecount = 0;
|
||||
int documentimagecount = 0;
|
||||
int pagecount = document.getNumberOfPages();
|
||||
boolean isDocumentColor = false;
|
||||
boolean isDocumentGray = false;
|
||||
boolean isDocumentA4 = true;
|
||||
boolean isDocumentLetter = true;
|
||||
boolean isDocumentLandscape = false;
|
||||
boolean isDocumentImage = true;
|
||||
for (int i = 0; i < pagecount; i++) {
|
||||
PDPage pdPage = document.getPage(i);
|
||||
Map<String, Object> pageMap = analyze(i, pdPage);
|
||||
Map<String, Object> pageMap = analyzePage(i, pdPage, seen);
|
||||
boolean isColor = (boolean) pageMap.get("iscolor");
|
||||
if (isColor) {
|
||||
isDocumentColor = true;
|
||||
|
@ -53,13 +63,28 @@ public class DocumentAnalyzer {
|
|||
if (isGray) {
|
||||
isDocumentGray = true;
|
||||
}
|
||||
boolean isA4 = (boolean) pageMap.get("isa4");
|
||||
isDocumentA4 = isDocumentA4 && isA4;
|
||||
boolean isLetter = (boolean) pageMap.get("isletter");
|
||||
isDocumentLetter = isDocumentLetter && isLetter;
|
||||
boolean isLandscape = (boolean) pageMap.get("islandscape");
|
||||
if (isLandscape) {
|
||||
isDocumentLandscape = true;
|
||||
}
|
||||
List<Map<String, Object>> list = (List<Map<String, Object>>) pageMap.get("images");
|
||||
imagecount += list.size();
|
||||
int imagecount = list.size();
|
||||
documentimagecount += imagecount;
|
||||
isDocumentImage = isDocumentImage && (imagecount == 1);
|
||||
pages.add(pageMap);
|
||||
}
|
||||
result.put("imagecount", imagecount);
|
||||
result.put("pagecount", pagecount);
|
||||
result.put("imagecount", documentimagecount);
|
||||
result.put("isimage", pagecount > 0 && isDocumentImage);
|
||||
result.put("iscolor", isDocumentColor);
|
||||
result.put("isgray", isDocumentGray);
|
||||
result.put("isa4", pagecount > 0 && isDocumentA4);
|
||||
result.put("isletter", pagecount > 0 && isDocumentLetter);
|
||||
result.put("islandscape", isDocumentLandscape);
|
||||
result.put("pages", pages);
|
||||
} catch (Exception e) {
|
||||
logger.log(Level.WARNING, e.getMessage(), e);
|
||||
|
@ -70,23 +95,50 @@ public class DocumentAnalyzer {
|
|||
return result;
|
||||
}
|
||||
|
||||
public boolean isValid() {
|
||||
return !result.isEmpty();
|
||||
}
|
||||
|
||||
public int getPageCount() {
|
||||
return (int) result.get("pagecount");
|
||||
}
|
||||
|
||||
public boolean isColor() {
|
||||
return (boolean) result.get("iscolor");
|
||||
}
|
||||
|
||||
|
||||
public boolean isGray() {
|
||||
return (boolean) result.get("isgray");
|
||||
}
|
||||
|
||||
private void documentToResult(PDDocument document) {
|
||||
public boolean isA4() {
|
||||
return (boolean) result.get("isa4");
|
||||
}
|
||||
|
||||
public boolean isLetter() {
|
||||
return (boolean) result.get("isletter");
|
||||
}
|
||||
|
||||
public boolean isLandscape() {
|
||||
return (boolean) result.get("islandscape");
|
||||
}
|
||||
|
||||
public boolean isImage() {
|
||||
return (boolean) result.get("isimage");
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public Map<String, Object> getPage(int pageNumber) {
|
||||
List<Map<String, Object>> pages = (List<Map<String, Object>>) result.get("pages");
|
||||
return pages.get(pageNumber);
|
||||
}
|
||||
|
||||
private void documentInformationToResult(PDDocumentInformation documentInformation) {
|
||||
try {
|
||||
PDDocumentInformation documentInformation = document.getDocumentInformation();
|
||||
result.put("author", documentInformation.getAuthor());
|
||||
result.put("creator", documentInformation.getCreator());
|
||||
result.put("producer", documentInformation.getProducer());
|
||||
result.put("title", documentInformation.getTitle());
|
||||
result.put("pagecount", document.getNumberOfPages());
|
||||
Calendar calendar = documentInformation.getCreationDate();
|
||||
if (calendar != null) {
|
||||
result.put("creationDate", calendar.toInstant());
|
||||
|
@ -114,15 +166,16 @@ public class DocumentAnalyzer {
|
|||
}
|
||||
}
|
||||
|
||||
public Map<String, Object> analyze(int i, PDPage page) throws IOException {
|
||||
private Map<String, Object> analyzePage(int i, PDPage page, Set<COSStream> seen) throws IOException {
|
||||
Map<String, Object> m = new LinkedHashMap<>();
|
||||
m.put("page", i);
|
||||
m.put("bbox", Map.of("height", page.getBBox().getHeight(), "width", page.getBBox().getWidth()));
|
||||
m.put("cropbox", Map.of("height", page.getCropBox().getHeight(), "width", page.getCropBox().getWidth()));
|
||||
m.put("mediabox", Map.of("height", page.getMediaBox().getHeight(), "width", page.getMediaBox().getWidth()));
|
||||
m.put("bleedbox", Map.of("height", page.getBleedBox().getHeight(), "width", page.getBleedBox().getWidth()));
|
||||
checkBoxDimensions(page, m);
|
||||
m.put("rotation", page.getRotation());
|
||||
PageExtractor pageExtractor = new PageExtractor(page);
|
||||
PageExtractor pageExtractor = new PageExtractor(page, seen);
|
||||
pageExtractor.process();
|
||||
m.put("images", pageExtractor.getImages());
|
||||
m.put("iscolor", pageExtractor.isColor());
|
||||
|
@ -133,16 +186,219 @@ public class DocumentAnalyzer {
|
|||
return m;
|
||||
}
|
||||
|
||||
class PageExtractor extends PDFGraphicsStreamEngine {
|
||||
private void checkBoxDimensions(PDPage page, Map<String, Object> m) {
|
||||
List<Boolean> isA4 = new ArrayList<>();
|
||||
List<Boolean> isLetter = new ArrayList<>();
|
||||
List<Boolean> isLandscape= new ArrayList<>();
|
||||
List.of(Map.entry("bbox", page.getBBox()),
|
||||
Map.entry("cropbox", page.getCropBox()),
|
||||
Map.entry("mediabox", page.getMediaBox()),
|
||||
Map.entry("bleedbox", page.getBleedBox())).forEach(e -> {
|
||||
String boxName = e.getKey();
|
||||
PDRectangle rect = e.getValue();
|
||||
Set<String> set = new LinkedHashSet<>();
|
||||
if (isA0(rect)) {
|
||||
set.add("A0");
|
||||
isLandscape.add(false);
|
||||
} else if (isA0Landscape(rect)) {
|
||||
set.add("A0 Landscape");
|
||||
isLandscape.add(true);
|
||||
}
|
||||
if (isA1(rect)) {
|
||||
set.add("A1");
|
||||
isLandscape.add(false);
|
||||
} else if (isA1Landscape(rect)) {
|
||||
set.add("A1 Landscape");
|
||||
isLandscape.add(true);
|
||||
}
|
||||
if (isA2(rect)) {
|
||||
set.add("A2");
|
||||
isLandscape.add(false);
|
||||
} else if (isA2Landscape(rect)) {
|
||||
set.add("A2 Landscape");
|
||||
isLandscape.add(true);
|
||||
}
|
||||
if (isA3(rect)) {
|
||||
set.add("A3");
|
||||
isLandscape.add(false);
|
||||
} else if (isA3Landscape(rect)) {
|
||||
set.add("A3 Landscape");
|
||||
isLandscape.add(true);
|
||||
}
|
||||
if (isA4(rect)) {
|
||||
set.add("A4");
|
||||
isA4.add(true);
|
||||
isLandscape.add(false);
|
||||
} else if (isA4Landscape(rect)) {
|
||||
set.add("A4 Landscape");
|
||||
isA4.add(true);
|
||||
isLandscape.add(true);
|
||||
} else {
|
||||
isA4.add(false);
|
||||
}
|
||||
if (isA5(rect)) {
|
||||
set.add("A5");
|
||||
isLandscape.add(false);
|
||||
} else if (isA5Landscape(rect)) {
|
||||
set.add("A5 Landscape");
|
||||
isLandscape.add(true);
|
||||
}
|
||||
if (isA6(rect)) {
|
||||
set.add("A6");
|
||||
isLandscape.add(false);
|
||||
} else if (isA6Landscape(rect)) {
|
||||
set.add("A6 Landscape");
|
||||
isLandscape.add(true);
|
||||
}
|
||||
if (isLetter(rect)) {
|
||||
set.add("LETTER");
|
||||
isLetter.add(true);
|
||||
isLandscape.add(false);
|
||||
} else if (isLetterLandscape(rect)) {
|
||||
set.add("LETTER Landscape");
|
||||
isLetter.add(true);
|
||||
isLandscape.add(true);
|
||||
} else {
|
||||
isLetter.add(false);
|
||||
}
|
||||
if (isLegal(rect)) {
|
||||
set.add("LEGAL");
|
||||
isLandscape.add(false);
|
||||
} else if (isLegalLandscape(rect)) {
|
||||
set.add("LEGAL Landscape");
|
||||
isLandscape.add(true);
|
||||
}
|
||||
if (isTabloid(rect)) {
|
||||
set.add("TABLOID");
|
||||
isLandscape.add(false);
|
||||
} else if (isTabloidLandscape(rect)) {
|
||||
set.add("TABLOID Landscape");
|
||||
isLandscape.add(true);
|
||||
}
|
||||
m.put(boxName + "dimensions", set);
|
||||
});
|
||||
m.put("isa4", !isA4.isEmpty() && isA4.stream().allMatch(b -> b == Boolean.TRUE));
|
||||
m.put("isletter", !isLandscape.isEmpty() && isLetter.stream().allMatch(b -> b == Boolean.TRUE));
|
||||
m.put("islandscape", !isLandscape.isEmpty() && isLandscape.stream().allMatch(b -> b == Boolean.TRUE));
|
||||
}
|
||||
|
||||
private boolean isA0(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.A0.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.A0.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA0Landscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(), PDRectangle.A0.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.A0.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA1(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.A1.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.A1.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA1Landscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(), PDRectangle.A1.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.A1.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA2(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.A2.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.A2.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA2Landscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(), PDRectangle.A2.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.A2.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA3(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.A3.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.A3.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA3Landscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(), PDRectangle.A3.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.A3.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA4(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(),PDRectangle.A4.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.A4.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA4Landscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(),PDRectangle.A4.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.A4.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA5(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.A5.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.A5.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA5Landscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(), PDRectangle.A5.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.A5.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA6(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.A6.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.A6.getWidth());
|
||||
}
|
||||
|
||||
private boolean isA6Landscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.A6.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.A6.getWidth());
|
||||
}
|
||||
|
||||
private boolean isLetter(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.LETTER.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.LETTER.getWidth());
|
||||
}
|
||||
|
||||
private boolean isLetterLandscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(), PDRectangle.LETTER.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.LETTER.getWidth());
|
||||
}
|
||||
|
||||
private boolean isLegal(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.LEGAL.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.LEGAL.getWidth());
|
||||
}
|
||||
|
||||
private boolean isLegalLandscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(), PDRectangle.LEGAL.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.LEGAL.getWidth());
|
||||
}
|
||||
|
||||
private boolean isTabloid(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getHeight(), PDRectangle.TABLOID.getHeight()) &&
|
||||
compareFloat(rectangle.getWidth(), PDRectangle.TABLOID.getWidth());
|
||||
}
|
||||
|
||||
private boolean isTabloidLandscape(PDRectangle rectangle) {
|
||||
return compareFloat(rectangle.getWidth(), PDRectangle.TABLOID.getHeight()) &&
|
||||
compareFloat(rectangle.getHeight(), PDRectangle.TABLOID.getWidth());
|
||||
}
|
||||
|
||||
private boolean compareFloat(float f1, float f2) {
|
||||
return f1 == f2 || (Math.abs(f2 - f1) < 1.0f);
|
||||
}
|
||||
|
||||
private static class PageExtractor extends PDFGraphicsStreamEngine {
|
||||
|
||||
private final List<Map<String, Object>> images;
|
||||
|
||||
private final Set<COSStream> seen;
|
||||
|
||||
private boolean isColor;
|
||||
|
||||
private boolean isGray;
|
||||
|
||||
protected PageExtractor(PDPage page) {
|
||||
private PageExtractor(PDPage page, Set<COSStream> seen) {
|
||||
super(page);
|
||||
this.seen = seen;
|
||||
this.images = new ArrayList<>();
|
||||
this.isColor = false;
|
||||
this.isGray = false;
|
||||
|
@ -204,11 +460,11 @@ public class DocumentAnalyzer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void moveTo(float x, float y) throws IOException {
|
||||
public void moveTo(float x, float y) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void lineTo(float x, float y) throws IOException {
|
||||
public void lineTo(float x, float y) {
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -232,7 +488,6 @@ public class DocumentAnalyzer {
|
|||
public void strokePath() {
|
||||
String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
|
||||
if (isColorSpaceName(colorSpaceName)) {
|
||||
logger.log(Level.INFO, "strokepath: color true, " + colorSpaceName);
|
||||
this.isColor = true;
|
||||
}
|
||||
if (isGraySpaceName(colorSpaceName)) {
|
||||
|
@ -244,7 +499,6 @@ public class DocumentAnalyzer {
|
|||
public void fillPath(int windingRule) {
|
||||
String colorSpaceName = getGraphicsState().getStrokingColor().getColorSpace().getName();
|
||||
if (isColorSpaceName(colorSpaceName)) {
|
||||
logger.log(Level.INFO, "fillpath: color true " + colorSpaceName);
|
||||
this.isColor = true;
|
||||
}
|
||||
if (isGraySpaceName(colorSpaceName)) {
|
||||
|
@ -263,6 +517,7 @@ public class DocumentAnalyzer {
|
|||
private boolean isColorSpaceName(String name) {
|
||||
return "DeviceRGB".equals(name) ||
|
||||
"DeviceCMYK".equals(name) ||
|
||||
"ICCBased".equals(name) ||
|
||||
"Indexed".equals(name);
|
||||
}
|
||||
|
||||
|
@ -271,13 +526,13 @@ public class DocumentAnalyzer {
|
|||
}
|
||||
}
|
||||
|
||||
static class FontExtractor {
|
||||
private static class FontExtractor {
|
||||
|
||||
private final List<Map<String, Object>> fonts;
|
||||
|
||||
private final PDResources res;
|
||||
|
||||
public FontExtractor(PDPage page) {
|
||||
private FontExtractor(PDPage page) {
|
||||
fonts = new ArrayList<>();
|
||||
res = page.getResources();
|
||||
}
|
||||
|
|
|
@ -18,16 +18,22 @@ public class DocumentAnalyzerTest {
|
|||
@Test
|
||||
public void testDocument() throws IOException {
|
||||
Path tmp = Files.createTempDirectory("document-analyzer");
|
||||
String sample = "antonio_sample.pdf";
|
||||
String sample = "20200000063.pdf";
|
||||
Path path = tmp.resolve(sample);
|
||||
try (InputStream inputStream = getClass().getResourceAsStream(sample);
|
||||
OutputStream outputStream = Files.newOutputStream(path)) {
|
||||
if (inputStream != null) {
|
||||
inputStream.transferTo(outputStream);
|
||||
DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer(path.toFile());
|
||||
DocumentAnalyzer documentAnalyzer = new DocumentAnalyzer();
|
||||
documentAnalyzer.process(path.toFile());
|
||||
logger.log(Level.INFO, "result = " + documentAnalyzer.getResult());
|
||||
logger.log(Level.INFO, "isvalid = " + documentAnalyzer.isValid());
|
||||
logger.log(Level.INFO, "iscolor = " + documentAnalyzer.isColor());
|
||||
logger.log(Level.INFO, "isgray = " + documentAnalyzer.isGray());
|
||||
logger.log(Level.INFO, "isA4 = " + documentAnalyzer.isA4());
|
||||
logger.log(Level.INFO, "isLetter = " + documentAnalyzer.isLetter());
|
||||
logger.log(Level.INFO, "islandscape = " + documentAnalyzer.isLandscape());
|
||||
logger.log(Level.INFO, "isimage = " + documentAnalyzer.isImage());
|
||||
}
|
||||
}
|
||||
Files.delete(path);
|
||||
|
|
Loading…
Reference in a new issue