allow value transforming in JSON writer, add buffer size

This commit is contained in:
Jörg Prante 2016-09-21 17:11:28 +02:00
parent ffcb74abcf
commit 09e8bedebe
7 changed files with 180 additions and 91 deletions

View file

@ -1,3 +1,3 @@
group = org.xbib
version = 1.0.2
version = 1.0.3
org.gradle.daemon = true

View file

@ -21,10 +21,12 @@ import org.xbib.marc.MarcField;
import org.xbib.marc.MarcListener;
import org.xbib.marc.MarcRecord;
import org.xbib.marc.label.RecordLabel;
import org.xbib.marc.transformer.value.MarcValueTransformers;
import org.xbib.marc.xml.MarcContentHandler;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.Flushable;
import java.io.IOException;
import java.io.OutputStream;
@ -32,8 +34,6 @@ import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@ -42,6 +42,8 @@ import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This Marc Writer is a MarcContentHandler that writes Marc events to JSON.
@ -50,6 +52,8 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
private static final int DEFAULT_BUFFER_SIZE = 8192;
public static final String LEADER_TAG = "_LEADER";
public static final String FORMAT_TAG = "_FORMAT";
@ -76,6 +80,8 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
private int splitlimit;
private int bufferSize;
/**
* Flag for indicating if writer is at top of file.
*/
@ -86,15 +92,20 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
}
public MarcJsonWriter(OutputStream out, boolean jsonlines) throws IOException {
this(new OutputStreamWriter(out, StandardCharsets.UTF_8), jsonlines);
this(out, DEFAULT_BUFFER_SIZE, jsonlines);
}
/**
 * Creates a JSON writer on top of an output stream. The stream is wrapped in an
 * {@link OutputStreamWriter} that encodes characters as UTF-8, and construction is
 * delegated to the writer-based constructor.
 *
 * @param out the output stream that receives the encoded JSON
 * @param bufferSize the buffer size, forwarded unchanged to the writer-based constructor
 * @param jsonlines the jsonlines flag, forwarded unchanged to the writer-based constructor
 * @throws IOException declared by the constructor this one delegates to
 */
public MarcJsonWriter(OutputStream out, int bufferSize, boolean jsonlines) throws IOException {
this(new OutputStreamWriter(out, StandardCharsets.UTF_8), bufferSize, jsonlines);
}
public MarcJsonWriter(Writer writer) throws IOException {
this(writer, false);
this(writer, DEFAULT_BUFFER_SIZE, false);
}
public MarcJsonWriter(Writer writer, boolean jsonlines) throws IOException {
this.writer = new BufferedWriter(writer);
public MarcJsonWriter(Writer writer, int bufferSize, boolean jsonlines) throws IOException {
this.writer = new BufferedWriter(writer, bufferSize);
this.bufferSize = bufferSize;
this.jsonlines = jsonlines;
this.lock = new ReentrantLock();
this.sb = new StringBuilder();
@ -103,10 +114,15 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
}
/**
 * Creates a JSON writer from a file name pattern and a split limit, using
 * {@code DEFAULT_BUFFER_SIZE} as the buffer size. Construction is delegated to the
 * constructor that takes an explicit buffer size.
 *
 * @param fileNamePattern the file name pattern, forwarded unchanged
 * @param splitlimit the split limit, forwarded unchanged
 * @throws IOException declared by the constructor this one delegates to
 */
public MarcJsonWriter(String fileNamePattern, int splitlimit) throws IOException {
this(fileNamePattern, DEFAULT_BUFFER_SIZE, splitlimit);
}
public MarcJsonWriter(String fileNamePattern, int bufferSize, int splitlimit) throws IOException {
this.fileNameCounter = new AtomicInteger(0);
this.fileNamePattern = fileNamePattern;
this.splitlimit = splitlimit;
this.writer = newWriter(fileNamePattern, fileNameCounter);
this.writer = newWriter(fileNamePattern, fileNameCounter, bufferSize);
this.bufferSize = bufferSize;
this.lock = new ReentrantLock();
this.sb = new StringBuilder();
this.builder = Marc.builder();
@ -114,10 +130,6 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
this.jsonlines = true;
}
private static String escape(String value) {
return value != null ? value.replaceAll("\"", "\\\"") : null;
}
public MarcJsonWriter setFatalErrors(boolean fatalErrors) {
this.fatalErrors = fatalErrors;
return this;
@ -129,6 +141,11 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
return this;
}
/**
 * Sets the value transformers on the underlying content handler and returns this
 * writer typed as {@code MarcJsonWriter}, so that the call can be chained with the
 * other fluent setters of this class.
 *
 * @param marcValueTransformers the value transformers to hand to the super class
 * @return this writer
 */
@Override
public MarcJsonWriter setMarcValueTransformers(MarcValueTransformers marcValueTransformers) {
    super.setMarcValueTransformers(marcValueTransformers);
    return this;
}
@Override
public MarcJsonWriter setFormat(String format) {
super.setFormat(format);
@ -171,7 +188,11 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
@Override
public void field(MarcField field) {
super.field(field);
builder.addField(field);
MarcField marcField = field;
if (marcValueTransformers != null) {
marcField = marcValueTransformers.transformValue(field);
}
builder.addField(marcField);
}
@Override
@ -387,7 +408,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
try {
endCollection();
close();
writer = newWriter(fileNamePattern, fileNameCounter);
writer = newWriter(fileNamePattern, fileNameCounter, bufferSize);
top = true;
beginCollection();
} catch (IOException e) {
@ -397,8 +418,18 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
}
}
private static BufferedWriter newWriter(String fileNamePattern, AtomicInteger fileNameCounter) throws IOException {
return Files.newBufferedWriter(Paths.get(String.format(fileNamePattern, fileNameCounter.getAndIncrement())));
/**
 * Opens the next output file of a split sequence.
 * The file name is produced by formatting {@code fileNamePattern} with the current
 * counter value; the counter is incremented as a side effect, so the next call
 * yields the next file name.
 *
 * @param fileNamePattern a {@link String#format} pattern that receives the counter value
 * @param fileNameCounter the counter that supplies the number of the file and is advanced by one
 * @param bufferSize the size of the write buffer in characters
 * @return a buffered writer on the new file that encodes characters as UTF-8
 * @throws IOException if the file cannot be opened for writing
 */
private static BufferedWriter newWriter(String fileNamePattern, AtomicInteger fileNameCounter, int bufferSize)
        throws IOException {
    String fileName = String.format(fileNamePattern, fileNameCounter.getAndIncrement());
    // FileWriter would encode with the platform default charset. JSON output must not
    // depend on the machine it is written on, so the charset is fixed to UTF-8 here,
    // the same as for the OutputStream based constructors.
    return new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8), bufferSize);
}
// Matches a literal double quote; compiled once because escape() is called per value.
private static final Pattern p = Pattern.compile("\"", Pattern.LITERAL);
// A backslash followed by a double quote, i.e. the escaped form of a double quote.
private static final String replacement = "\\\"";

/**
 * Escapes every double quote in a value by putting a backslash in front of it.
 *
 * @param value the value to escape, may be {@code null}
 * @return the escaped value, or {@code null} if {@code value} is {@code null}
 */
private static String escape(String value) {
    // The previous implementation passed null through; keep that contract instead of
    // failing with a NullPointerException inside the matcher.
    if (value == null) {
        return null;
    }
    // quoteReplacement is required: in a regex replacement string a backslash is itself
    // an escape character, so the raw replacement would yield a bare double quote.
    // NOTE(review): only double quotes are escaped here; backslashes and control
    // characters in the value are passed through unchanged - confirm this is intended.
    return p.matcher(value).replaceAll(Matcher.quoteReplacement(replacement));
}
}

View file

@ -58,6 +58,9 @@ public class MarcValueTransformers {
if (transformer != null) {
MarcField.Builder builder = MarcField.builder();
builder.tag(field.getTag()).indicator(field.getIndicator());
if (field.getValue() != null) {
builder.value(transformer.transform(field.getValue()));
}
field.getSubfields().forEach(subfield ->
builder.subfield(subfield.getId(), transformer.transform(subfield.getValue())));
return builder.build();

View file

@ -179,77 +179,6 @@ public class MarcTest extends Assert {
recordIDs.toString());
}
/**
* ZDB MARC Bibliographic.
*/
@Test
public void testZDBBib() throws Exception {
String s = "zdbtitutf8.mrc";
InputStream in = getClass().getResource(s).openStream();
File file = File.createTempFile(s + ".", ".xml");
file.deleteOnExit();
FileOutputStream out = new FileOutputStream(file);
MarcValueTransformers marcValueTransformers = new MarcValueTransformers();
marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC));
try (MarcXchangeWriter writer = new MarcXchangeWriter(out)
.setMarcValueTransformers(marcValueTransformers)) {
Marc.builder()
.setInputStream(in)
.setCharset(StandardCharsets.UTF_8)
.setMarcListener(writer)
.build()
.writeCollection();
assertNull(writer.getException());
}
assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + ".xml").openStream()));
}
@Test
public void testZDBStream() throws IOException {
String s = "zdblokutf8.mrc";
InputStream in = getClass().getResource(s).openStream();
long count = Marc.builder()
.setInputStream(in)
.setCharset(StandardCharsets.UTF_8)
.build().iso2709Stream().chunks().count();
in.close();
assertEquals(10170L, count);
in = getClass().getResource(s).openStream();
Marc.builder()
.setInputStream(in)
.setCharset(StandardCharsets.UTF_8)
.build().iso2709Stream().chunks()
.forEach(chunk -> assertTrue(chunk.data().length() >= 0));
in.close();
}
/**
* ZDB MARC Holdings.
*/
@Test
public void testZDBLok() throws Exception {
String s = "zdblokutf8.mrc";
InputStream in = getClass().getResource(s).openStream();
File file = File.createTempFile(s + ".", ".xml");
file.deleteOnExit();
FileOutputStream out = new FileOutputStream(file);
MarcValueTransformers marcValueTransformers = new MarcValueTransformers();
marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC));
try (MarcXchangeWriter writer = new MarcXchangeWriter(out)
.setMarcValueTransformers(marcValueTransformers)) {
Marc.builder()
.setInputStream(in)
.setCharset(StandardCharsets.UTF_8)
.setMarcListener(writer)
.build()
.writeCollection();
assertNull(writer.getException());
}
assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + ".xml").openStream()));
}
/**
* There may be faulty input streams that contain information separators at the wrong place.
* For the problem, see {@code org.marc4j.test.PermissiveReaderTest#testCyrillicEFix()}.

View file

@ -16,13 +16,26 @@
*/
package org.xbib.marc;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.xbib.helper.StreamMatcher.assertStream;
import org.junit.Test;
import org.xbib.marc.json.MarcJsonWriter;
import org.xbib.marc.transformer.value.MarcValueTransformers;
import org.xbib.marc.xml.MarcXchangeWriter;
import org.xmlunit.matchers.CompareMatcher;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
/**
*
@ -99,4 +112,72 @@ public class ZDBTest {
}
}
/**
 * ZDB MARC Bibliographic.
 * Reads {@code zdbtitutf8.mrc}, normalizes every value to Unicode NFC and writes the
 * records with a {@link MarcJsonWriter} to a temporary file. The test passes if the
 * writer has recorded no exception.
 */
@Test
public void testZDBBib() throws Exception {
    String s = "zdbtitutf8.mrc";
    File file = File.createTempFile(s + ".", ".json");
    file.deleteOnExit();
    MarcValueTransformers marcValueTransformers = new MarcValueTransformers();
    marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC));
    // All three resources are declared here so that the input stream is always closed
    // and the file stream does not leak if creating the writer fails.
    try (InputStream in = getClass().getResource(s).openStream();
         OutputStream out = new FileOutputStream(file);
         MarcJsonWriter writer = new MarcJsonWriter(out, true)
                 .setFormat(MarcXchangeConstants.MARCXCHANGE_FORMAT)
                 .setType(MarcXchangeConstants.BIBLIOGRAPHIC_TYPE)
                 .setMarcValueTransformers(marcValueTransformers)) {
        Marc.builder()
                .setInputStream(in)
                .setMarcListener(writer)
                .build()
                .writeCollection();
        assertNull(writer.getException());
    }
}
/**
 * Streams {@code zdblokutf8.mrc} as ISO 2709 chunks twice: the first pass counts the
 * chunks and expects 10170 of them, the second pass reads the data of every chunk.
 */
@Test
public void testZDBStream() throws IOException {
    String s = "zdblokutf8.mrc";
    long count;
    // try-with-resources closes the stream even if building or counting throws
    try (InputStream in = getClass().getResource(s).openStream()) {
        count = Marc.builder()
                .setInputStream(in)
                .setCharset(StandardCharsets.UTF_8)
                .build().iso2709Stream().chunks().count();
    }
    assertEquals(10170L, count);
    try (InputStream in = getClass().getResource(s).openStream()) {
        // The condition itself always holds; the point of this pass is that every
        // chunk can be visited and its data read without an exception.
        Marc.builder()
                .setInputStream(in)
                .setCharset(StandardCharsets.UTF_8)
                .build().iso2709Stream().chunks()
                .forEach(chunk -> assertTrue(chunk.data().length() >= 0));
    }
}
/**
 * ZDB MARC Holdings.
 * Reads {@code zdblokutf8.mrc}, normalizes every value to Unicode NFC, writes the
 * records with a {@link MarcXchangeWriter} to a temporary file and compares that file
 * with the expected resource {@code zdblokutf8.mrc.xml}.
 */
@Test
public void testZDBLok() throws Exception {
    String s = "zdblokutf8.mrc";
    File file = File.createTempFile(s + ".", ".xml");
    file.deleteOnExit();
    MarcValueTransformers marcValueTransformers = new MarcValueTransformers();
    marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC));
    // All three resources are declared here so that the input stream is always closed
    // and the file stream does not leak if creating the writer fails.
    try (InputStream in = getClass().getResource(s).openStream();
         FileOutputStream out = new FileOutputStream(file);
         MarcXchangeWriter writer = new MarcXchangeWriter(out)
                 .setMarcValueTransformers(marcValueTransformers)) {
        Marc.builder()
                .setInputStream(in)
                .setCharset(StandardCharsets.UTF_8)
                .setMarcListener(writer)
                .build()
                .writeCollection();
        assertNull(writer.getException());
    }
    // compare only after the writer has been closed, so the file is complete
    assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + ".xml").openStream()));
}
}

View file

@ -178,13 +178,13 @@ public class MarcJsonWriterTest {
.writeCollection();
assertEquals(10, writer.getRecordCounter());
File f0 = new File("build/0.json");
assertTrue(f0.exists() && f0.length() == 6022);
assertTrue(f0.exists() && f0.length() == 6015);
File f1 = new File("build/1.json");
assertTrue(f1.exists() && f1.length() == 7150);
assertTrue(f1.exists() && f1.length() == 7127);
File f2 = new File("build/2.json");
assertTrue(f2.exists() && f2.length() == 6424);
assertTrue(f2.exists() && f2.length() == 6426);
File f3 = new File("build/3.json");
assertTrue(f3.exists() && f3.length() == 2114);
assertTrue(f3.exists() && f3.length() == 2110);
File f4 = new File("build/4.json");
assertFalse(f4.exists());
}

View file

@ -0,0 +1,45 @@
package org.xbib.marc.transformer;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
import org.xbib.marc.MarcField;
import org.xbib.marc.transformer.value.MarcValueTransformer;
import org.xbib.marc.transformer.value.MarcValueTransformers;
/**
 * Checks {@code MarcValueTransformers.transformValue} with a transformer that replaces
 * the value "World" by "Earth" and leaves every other value as it is.
 */
public class MarcValueTransformerTest {

    /**
     * The transformer is applied to the subfield values of a data field.
     */
    @Test
    public void testValueTransformer() {
        MarcField original = MarcField.builder()
                .tag("100")
                .subfield("a", "Hello")
                .subfield("b", "World")
                .build();
        MarcField transformed = worldToEarth().transformValue(original);
        assertEquals("100$$ab[a=Hello, b=Earth]", transformed.toString());
    }

    /**
     * The transformer is applied to the value of a field that has no subfields.
     */
    @Test
    public void testValueControlFieldTransformer() {
        MarcField original = MarcField.builder()
                .tag("001")
                .value("World")
                .build();
        MarcField transformed = worldToEarth().transformValue(original);
        assertEquals("001$$Earth", transformed.toString());
    }

    /**
     * Builds the transformers shared by both tests.
     *
     * @return transformers whose value transformer maps "World" to "Earth"
     */
    private static MarcValueTransformers worldToEarth() {
        MarcValueTransformers transformers = new MarcValueTransformers();
        transformers.setMarcValueTransformer(value -> value.equals("World") ? "Earth" : value);
        return transformers;
    }
}