From 09e8bedebe1f37f0564e9206558d3883d92ffadf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=CC=88rg=20Prante?= Date: Wed, 21 Sep 2016 17:11:28 +0200 Subject: [PATCH] allow value transforming in JSON writer, add buffer size --- gradle.properties | 2 +- .../org/xbib/marc/json/MarcJsonWriter.java | 61 ++++++++++---- .../value/MarcValueTransformers.java | 3 + src/test/java/org/xbib/marc/MarcTest.java | 71 ---------------- src/test/java/org/xbib/marc/ZDBTest.java | 81 +++++++++++++++++++ .../xbib/marc/json/MarcJsonWriterTest.java | 8 +- .../transformer/MarcValueTransformerTest.java | 45 +++++++++++ 7 files changed, 180 insertions(+), 91 deletions(-) create mode 100644 src/test/java/org/xbib/marc/transformer/MarcValueTransformerTest.java diff --git a/gradle.properties b/gradle.properties index d4178b8..b4da824 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,3 +1,3 @@ group = org.xbib -version = 1.0.2 +version = 1.0.3 org.gradle.daemon = true diff --git a/src/main/java/org/xbib/marc/json/MarcJsonWriter.java b/src/main/java/org/xbib/marc/json/MarcJsonWriter.java index 1e1142a..395923c 100644 --- a/src/main/java/org/xbib/marc/json/MarcJsonWriter.java +++ b/src/main/java/org/xbib/marc/json/MarcJsonWriter.java @@ -21,10 +21,12 @@ import org.xbib.marc.MarcField; import org.xbib.marc.MarcListener; import org.xbib.marc.MarcRecord; import org.xbib.marc.label.RecordLabel; +import org.xbib.marc.transformer.value.MarcValueTransformers; import org.xbib.marc.xml.MarcContentHandler; import java.io.BufferedWriter; import java.io.Closeable; +import java.io.FileWriter; import java.io.Flushable; import java.io.IOException; import java.io.OutputStream; @@ -32,8 +34,6 @@ import java.io.OutputStreamWriter; import java.io.UncheckedIOException; import java.io.Writer; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Paths; import java.util.Collections; import java.util.List; import java.util.Map; @@ -42,6 +42,8 @@ import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * This Marc Writer is a MarcContentHandler that writes Marc events to JSON. @@ -50,6 +52,8 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName()); + private static final int DEFAULT_BUFFER_SIZE = 8192; + public static final String LEADER_TAG = "_LEADER"; public static final String FORMAT_TAG = "_FORMAT"; @@ -76,6 +80,8 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo private int splitlimit; + private int bufferSize; + /** * Flag for indicating if writer is at top of file. */ @@ -86,15 +92,20 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo } public MarcJsonWriter(OutputStream out, boolean jsonlines) throws IOException { - this(new OutputStreamWriter(out, StandardCharsets.UTF_8), jsonlines); + this(out, DEFAULT_BUFFER_SIZE, jsonlines); + } + + public MarcJsonWriter(OutputStream out, int bufferSize, boolean jsonlines) throws IOException { + this(new OutputStreamWriter(out, StandardCharsets.UTF_8), bufferSize, jsonlines); } public MarcJsonWriter(Writer writer) throws IOException { - this(writer, false); + this(writer, DEFAULT_BUFFER_SIZE, false); } - public MarcJsonWriter(Writer writer, boolean jsonlines) throws IOException { - this.writer = new BufferedWriter(writer); + public MarcJsonWriter(Writer writer, int bufferSize, boolean jsonlines) throws IOException { + this.writer = new BufferedWriter(writer, bufferSize); + this.bufferSize = bufferSize; this.jsonlines = jsonlines; this.lock = new ReentrantLock(); this.sb = new StringBuilder(); @@ -103,10 +114,15 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo } public MarcJsonWriter(String fileNamePattern, int splitlimit) throws IOException { + this(fileNamePattern, DEFAULT_BUFFER_SIZE, splitlimit); + } + + public MarcJsonWriter(String fileNamePattern, int bufferSize, int splitlimit) throws IOException { this.fileNameCounter = new AtomicInteger(0); this.fileNamePattern = fileNamePattern; this.splitlimit = splitlimit; - this.writer = newWriter(fileNamePattern, fileNameCounter); + this.writer = newWriter(fileNamePattern, fileNameCounter, bufferSize); + this.bufferSize = bufferSize; this.lock = new ReentrantLock(); this.sb = new StringBuilder(); this.builder = Marc.builder(); @@ -114,10 +130,6 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo this.jsonlines = true; } - private static String escape(String value) { - return value != null ? value.replaceAll("\"", "\\\"") : null; - } - public MarcJsonWriter setFatalErrors(boolean fatalErrors) { this.fatalErrors = fatalErrors; return this; @@ -129,6 +141,11 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo return this; } + public MarcJsonWriter setMarcValueTransformers(MarcValueTransformers marcValueTransformers) { + super.setMarcValueTransformers(marcValueTransformers); + return this; + } + @Override public MarcJsonWriter setFormat(String format) { super.setFormat(format); @@ -171,7 +188,11 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo @Override public void field(MarcField field) { super.field(field); - builder.addField(field); + MarcField marcField = field; + if (marcValueTransformers != null) { + marcField = marcValueTransformers.transformValue(field); + } + builder.addField(marcField); } @Override @@ -387,7 +408,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo try { endCollection(); close(); - writer = newWriter(fileNamePattern, fileNameCounter); + writer = newWriter(fileNamePattern, fileNameCounter, bufferSize); top = true; beginCollection(); } catch (IOException e) { @@ -397,8 +418,18 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo } } - private static BufferedWriter newWriter(String fileNamePattern, AtomicInteger fileNameCounter) throws IOException { - return Files.newBufferedWriter(Paths.get(String.format(fileNamePattern, fileNameCounter.getAndIncrement()))); + private static BufferedWriter newWriter(String fileNamePattern, AtomicInteger fileNameCounter, int bufferSize) + throws IOException { + String s = String.format(fileNamePattern, fileNameCounter.getAndIncrement()); + return new BufferedWriter(new FileWriter(s), bufferSize); + } + + private static final Pattern p = Pattern.compile("\"", Pattern.LITERAL); + + private static final String replacement = "\\\""; + + private static String escape(String value) { + return p.matcher(value).replaceAll(Matcher.quoteReplacement(replacement)); } } diff --git a/src/main/java/org/xbib/marc/transformer/value/MarcValueTransformers.java b/src/main/java/org/xbib/marc/transformer/value/MarcValueTransformers.java index 3332f3f..6e2382c 100644 --- a/src/main/java/org/xbib/marc/transformer/value/MarcValueTransformers.java +++ b/src/main/java/org/xbib/marc/transformer/value/MarcValueTransformers.java @@ -58,6 +58,9 @@ public class MarcValueTransformers { if (transformer != null) { MarcField.Builder builder = MarcField.builder(); builder.tag(field.getTag()).indicator(field.getIndicator()); + if (field.getValue() != null) { + builder.value(transformer.transform(field.getValue())); + } field.getSubfields().forEach(subfield -> builder.subfield(subfield.getId(), transformer.transform(subfield.getValue()))); return builder.build(); diff --git a/src/test/java/org/xbib/marc/MarcTest.java b/src/test/java/org/xbib/marc/MarcTest.java index 97a9452..4a8dfcd 100644 --- a/src/test/java/org/xbib/marc/MarcTest.java +++ b/src/test/java/org/xbib/marc/MarcTest.java @@ -179,77 +179,6 @@ public class MarcTest extends Assert { recordIDs.toString()); } - /** - * ZDB MARC Bibliographic. - */ - @Test - public void testZDBBib() throws Exception { - String s = "zdbtitutf8.mrc"; - InputStream in = getClass().getResource(s).openStream(); - File file = File.createTempFile(s + ".", ".xml"); - file.deleteOnExit(); - FileOutputStream out = new FileOutputStream(file); - MarcValueTransformers marcValueTransformers = new MarcValueTransformers(); - marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC)); - try (MarcXchangeWriter writer = new MarcXchangeWriter(out) - .setMarcValueTransformers(marcValueTransformers)) { - Marc.builder() - .setInputStream(in) - .setCharset(StandardCharsets.UTF_8) - .setMarcListener(writer) - .build() - .writeCollection(); - assertNull(writer.getException()); - } - assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + ".xml").openStream())); - } - - @Test - public void testZDBStream() throws IOException { - String s = "zdblokutf8.mrc"; - InputStream in = getClass().getResource(s).openStream(); - long count = Marc.builder() - .setInputStream(in) - .setCharset(StandardCharsets.UTF_8) - .build().iso2709Stream().chunks().count(); - in.close(); - assertEquals(10170L, count); - - in = getClass().getResource(s).openStream(); - Marc.builder() - .setInputStream(in) - .setCharset(StandardCharsets.UTF_8) - .build().iso2709Stream().chunks() - .forEach(chunk -> assertTrue(chunk.data().length() >= 0)); - in.close(); - } - - /** - * ZDB MARC Holdings. - */ - @Test - public void testZDBLok() throws Exception { - String s = "zdblokutf8.mrc"; - InputStream in = getClass().getResource(s).openStream(); - File file = File.createTempFile(s + ".", ".xml"); - file.deleteOnExit(); - FileOutputStream out = new FileOutputStream(file); - MarcValueTransformers marcValueTransformers = new MarcValueTransformers(); - marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC)); - try (MarcXchangeWriter writer = new MarcXchangeWriter(out) - .setMarcValueTransformers(marcValueTransformers)) { - Marc.builder() - .setInputStream(in) - .setCharset(StandardCharsets.UTF_8) - .setMarcListener(writer) - .build() - .writeCollection(); - assertNull(writer.getException()); - } - assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + ".xml").openStream())); - } - - /** * Ther may be faulty input streams that contain information separators at the wrong place. * For the problem, see {@code org.marc4j.test.PermissiveReaderTest#testCyrillicEFix()}. diff --git a/src/test/java/org/xbib/marc/ZDBTest.java b/src/test/java/org/xbib/marc/ZDBTest.java index d80482d..bc43909 100644 --- a/src/test/java/org/xbib/marc/ZDBTest.java +++ b/src/test/java/org/xbib/marc/ZDBTest.java @@ -16,13 +16,26 @@ */ package org.xbib.marc; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; import static org.xbib.helper.StreamMatcher.assertStream; import org.junit.Test; +import org.xbib.marc.json.MarcJsonWriter; +import org.xbib.marc.transformer.value.MarcValueTransformers; +import org.xbib.marc.xml.MarcXchangeWriter; +import org.xmlunit.matchers.CompareMatcher; import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.nio.charset.StandardCharsets; +import java.text.Normalizer; /** * @@ -99,4 +112,72 @@ public class ZDBTest { } } + + /** + * ZDB MARC Bibliographic. + */ + + @Test + public void testZDBBib() throws Exception { + String s = "zdbtitutf8.mrc"; + InputStream in = getClass().getResource(s).openStream(); + File file = File.createTempFile(s, ".json"); + file.deleteOnExit(); + OutputStream out = new FileOutputStream(file); + MarcValueTransformers marcValueTransformers = new MarcValueTransformers(); + marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC)); + try (MarcJsonWriter writer = new MarcJsonWriter(out, true) + .setFormat(MarcXchangeConstants.MARCXCHANGE_FORMAT) + .setType(MarcXchangeConstants.BIBLIOGRAPHIC_TYPE) + .setMarcValueTransformers(marcValueTransformers)) { + Marc.builder() + .setInputStream(in) + .setMarcListener(writer) + .build() + .writeCollection(); + assertNull(writer.getException()); + } + } + + @Test + public void testZDBStream() throws IOException { + String s = "zdblokutf8.mrc"; + InputStream in = getClass().getResource(s).openStream(); + long count = Marc.builder() + .setInputStream(in) + .setCharset(StandardCharsets.UTF_8) + .build().iso2709Stream().chunks().count(); + in.close(); + assertEquals(10170L, count); + + in = getClass().getResource(s).openStream(); + Marc.builder() + .setInputStream(in) + .setCharset(StandardCharsets.UTF_8) + .build().iso2709Stream().chunks() + .forEach(chunk -> assertTrue(chunk.data().length() >= 0)); + in.close(); + } + + @Test + public void testZDBLok() throws Exception { + String s = "zdblokutf8.mrc"; + InputStream in = getClass().getResource(s).openStream(); + File file = File.createTempFile(s + ".", ".xml"); + file.deleteOnExit(); + FileOutputStream out = new FileOutputStream(file); + MarcValueTransformers marcValueTransformers = new MarcValueTransformers(); + marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC)); + try (MarcXchangeWriter writer = new MarcXchangeWriter(out) + .setMarcValueTransformers(marcValueTransformers)) { + Marc.builder() + .setInputStream(in) + .setCharset(StandardCharsets.UTF_8) + .setMarcListener(writer) + .build() + .writeCollection(); + assertNull(writer.getException()); + } + assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + ".xml").openStream())); + } } diff --git a/src/test/java/org/xbib/marc/json/MarcJsonWriterTest.java b/src/test/java/org/xbib/marc/json/MarcJsonWriterTest.java index d437609..8f13c42 100644 --- a/src/test/java/org/xbib/marc/json/MarcJsonWriterTest.java +++ b/src/test/java/org/xbib/marc/json/MarcJsonWriterTest.java @@ -178,13 +178,13 @@ public class MarcJsonWriterTest { .writeCollection(); assertEquals(10, writer.getRecordCounter()); File f0 = new File("build/0.json"); - assertTrue(f0.exists() && f0.length() == 6022); + assertTrue(f0.exists() && f0.length() == 6015); File f1 = new File("build/1.json"); - assertTrue(f1.exists() && f1.length() == 7150); + assertTrue(f1.exists() && f1.length() == 7127); File f2 = new File("build/2.json"); - assertTrue(f2.exists() && f2.length() == 6424); + assertTrue(f2.exists() && f2.length() == 6426); File f3 = new File("build/3.json"); - assertTrue(f3.exists() && f3.length() == 2114); + assertTrue(f3.exists() && f3.length() == 2110); File f4 = new File("build/4.json"); assertFalse(f4.exists()); } diff --git a/src/test/java/org/xbib/marc/transformer/MarcValueTransformerTest.java b/src/test/java/org/xbib/marc/transformer/MarcValueTransformerTest.java new file mode 100644 index 0000000..4abdd2a --- /dev/null +++ b/src/test/java/org/xbib/marc/transformer/MarcValueTransformerTest.java @@ -0,0 +1,45 @@ +package org.xbib.marc.transformer; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; +import org.xbib.marc.MarcField; +import org.xbib.marc.transformer.value.MarcValueTransformer; +import org.xbib.marc.transformer.value.MarcValueTransformers; + +/** + * + */ +public class MarcValueTransformerTest { + + @Test + public void testValueTransformer() { + MarcValueTransformer marcValueTransformer = new MarcValueTransformer() { + @Override + public String transform(String value) { + return value.equals("World") ? "Earth" : value; + } + }; + MarcValueTransformers marcValueTransformers = new MarcValueTransformers(); + marcValueTransformers.setMarcValueTransformer(marcValueTransformer); + MarcField a = MarcField.builder().tag("100").subfield("a", "Hello").subfield("b", "World").build(); + MarcField b = marcValueTransformers.transformValue(a); + assertEquals("100$$ab[a=Hello, b=Earth]", b.toString()); + } + + @Test + public void testValueControlFieldTransformer() { + MarcValueTransformer marcValueTransformer = new MarcValueTransformer() { + @Override + public String transform(String value) { + return value.equals("World") ? "Earth" : value; + } + }; + MarcValueTransformers marcValueTransformers = new MarcValueTransformers(); + marcValueTransformers.setMarcValueTransformer(marcValueTransformer); + MarcField a = MarcField.builder().tag("001").value("World").build(); + MarcField b = marcValueTransformers.transformValue(a); + assertEquals("001$$Earth", b.toString()); + } + +}