allow value transforming in JSON writer, add buffer size

This commit is contained in:
Jörg Prante 2016-09-21 17:11:28 +02:00
parent ffcb74abcf
commit 09e8bedebe
7 changed files with 180 additions and 91 deletions

View file

@ -1,3 +1,3 @@
group = org.xbib group = org.xbib
version = 1.0.2 version = 1.0.3
org.gradle.daemon = true org.gradle.daemon = true

View file

@ -21,10 +21,12 @@ import org.xbib.marc.MarcField;
import org.xbib.marc.MarcListener; import org.xbib.marc.MarcListener;
import org.xbib.marc.MarcRecord; import org.xbib.marc.MarcRecord;
import org.xbib.marc.label.RecordLabel; import org.xbib.marc.label.RecordLabel;
import org.xbib.marc.transformer.value.MarcValueTransformers;
import org.xbib.marc.xml.MarcContentHandler; import org.xbib.marc.xml.MarcContentHandler;
import java.io.BufferedWriter; import java.io.BufferedWriter;
import java.io.Closeable; import java.io.Closeable;
import java.io.FileWriter;
import java.io.Flushable; import java.io.Flushable;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
@ -32,8 +34,6 @@ import java.io.OutputStreamWriter;
import java.io.UncheckedIOException; import java.io.UncheckedIOException;
import java.io.Writer; import java.io.Writer;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -42,6 +42,8 @@ import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** /**
* This Marc Writer is a MarcContentHandler that writes Marc events to JSON. * This Marc Writer is a MarcContentHandler that writes Marc events to JSON.
@ -50,6 +52,8 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName()); private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
private static final int DEFAULT_BUFFER_SIZE = 8192;
public static final String LEADER_TAG = "_LEADER"; public static final String LEADER_TAG = "_LEADER";
public static final String FORMAT_TAG = "_FORMAT"; public static final String FORMAT_TAG = "_FORMAT";
@ -76,6 +80,8 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
private int splitlimit; private int splitlimit;
private int bufferSize;
/** /**
* Flag for indicating if writer is at top of file. * Flag for indicating if writer is at top of file.
*/ */
@ -86,15 +92,20 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
} }
public MarcJsonWriter(OutputStream out, boolean jsonlines) throws IOException { public MarcJsonWriter(OutputStream out, boolean jsonlines) throws IOException {
this(new OutputStreamWriter(out, StandardCharsets.UTF_8), jsonlines); this(out, DEFAULT_BUFFER_SIZE, jsonlines);
}
public MarcJsonWriter(OutputStream out, int bufferSize, boolean jsonlines) throws IOException {
this(new OutputStreamWriter(out, StandardCharsets.UTF_8), bufferSize, jsonlines);
} }
public MarcJsonWriter(Writer writer) throws IOException { public MarcJsonWriter(Writer writer) throws IOException {
this(writer, false); this(writer, DEFAULT_BUFFER_SIZE, false);
} }
public MarcJsonWriter(Writer writer, boolean jsonlines) throws IOException { public MarcJsonWriter(Writer writer, int bufferSize, boolean jsonlines) throws IOException {
this.writer = new BufferedWriter(writer); this.writer = new BufferedWriter(writer, bufferSize);
this.bufferSize = bufferSize;
this.jsonlines = jsonlines; this.jsonlines = jsonlines;
this.lock = new ReentrantLock(); this.lock = new ReentrantLock();
this.sb = new StringBuilder(); this.sb = new StringBuilder();
@ -103,10 +114,15 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
} }
public MarcJsonWriter(String fileNamePattern, int splitlimit) throws IOException { public MarcJsonWriter(String fileNamePattern, int splitlimit) throws IOException {
this(fileNamePattern, DEFAULT_BUFFER_SIZE, splitlimit);
}
public MarcJsonWriter(String fileNamePattern, int bufferSize, int splitlimit) throws IOException {
this.fileNameCounter = new AtomicInteger(0); this.fileNameCounter = new AtomicInteger(0);
this.fileNamePattern = fileNamePattern; this.fileNamePattern = fileNamePattern;
this.splitlimit = splitlimit; this.splitlimit = splitlimit;
this.writer = newWriter(fileNamePattern, fileNameCounter); this.writer = newWriter(fileNamePattern, fileNameCounter, bufferSize);
this.bufferSize = bufferSize;
this.lock = new ReentrantLock(); this.lock = new ReentrantLock();
this.sb = new StringBuilder(); this.sb = new StringBuilder();
this.builder = Marc.builder(); this.builder = Marc.builder();
@ -114,10 +130,6 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
this.jsonlines = true; this.jsonlines = true;
} }
private static String escape(String value) {
return value != null ? value.replaceAll("\"", "\\\"") : null;
}
public MarcJsonWriter setFatalErrors(boolean fatalErrors) { public MarcJsonWriter setFatalErrors(boolean fatalErrors) {
this.fatalErrors = fatalErrors; this.fatalErrors = fatalErrors;
return this; return this;
@ -129,6 +141,11 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
return this; return this;
} }
/**
 * Sets the value transformers by delegating to the superclass setter and
 * returns this writer, so that setter calls can be chained on the
 * {@code MarcJsonWriter} type.
 *
 * @param marcValueTransformers the value transformers handed to the superclass setter
 * @return this writer
 */
@Override
public MarcJsonWriter setMarcValueTransformers(MarcValueTransformers marcValueTransformers) {
    super.setMarcValueTransformers(marcValueTransformers);
    return this;
}
@Override @Override
public MarcJsonWriter setFormat(String format) { public MarcJsonWriter setFormat(String format) {
super.setFormat(format); super.setFormat(format);
@ -171,7 +188,11 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
@Override @Override
public void field(MarcField field) { public void field(MarcField field) {
super.field(field); super.field(field);
builder.addField(field); MarcField marcField = field;
if (marcValueTransformers != null) {
marcField = marcValueTransformers.transformValue(field);
}
builder.addField(marcField);
} }
@Override @Override
@ -387,7 +408,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
try { try {
endCollection(); endCollection();
close(); close();
writer = newWriter(fileNamePattern, fileNameCounter); writer = newWriter(fileNamePattern, fileNameCounter, bufferSize);
top = true; top = true;
beginCollection(); beginCollection();
} catch (IOException e) { } catch (IOException e) {
@ -397,8 +418,18 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
} }
} }
private static BufferedWriter newWriter(String fileNamePattern, AtomicInteger fileNameCounter) throws IOException { private static BufferedWriter newWriter(String fileNamePattern, AtomicInteger fileNameCounter, int bufferSize)
return Files.newBufferedWriter(Paths.get(String.format(fileNamePattern, fileNameCounter.getAndIncrement()))); throws IOException {
String s = String.format(fileNamePattern, fileNameCounter.getAndIncrement());
return new BufferedWriter(new FileWriter(s), bufferSize);
}
private static final Pattern p = Pattern.compile("\"", Pattern.LITERAL);
private static final String replacement = "\\\"";
private static String escape(String value) {
return p.matcher(value).replaceAll(Matcher.quoteReplacement(replacement));
} }
} }

View file

@ -58,6 +58,9 @@ public class MarcValueTransformers {
if (transformer != null) { if (transformer != null) {
MarcField.Builder builder = MarcField.builder(); MarcField.Builder builder = MarcField.builder();
builder.tag(field.getTag()).indicator(field.getIndicator()); builder.tag(field.getTag()).indicator(field.getIndicator());
if (field.getValue() != null) {
builder.value(transformer.transform(field.getValue()));
}
field.getSubfields().forEach(subfield -> field.getSubfields().forEach(subfield ->
builder.subfield(subfield.getId(), transformer.transform(subfield.getValue()))); builder.subfield(subfield.getId(), transformer.transform(subfield.getValue())));
return builder.build(); return builder.build();

View file

@ -179,77 +179,6 @@ public class MarcTest extends Assert {
recordIDs.toString()); recordIDs.toString());
} }
/**
 * ZDB MARC Bibliographic.
 *
 * Reads the {@code zdbtitutf8.mrc} test resource as UTF-8, writes it through a
 * {@code MarcXchangeWriter} into a temporary XML file while normalizing every
 * value to NFC, and compares the result with the resource
 * {@code zdbtitutf8.mrc.xml}.
 *
 * @throws Exception if reading the resource or writing the temporary file fails
 */
@Test
public void testZDBBib() throws Exception {
String s = "zdbtitutf8.mrc";
InputStream in = getClass().getResource(s).openStream();
// temporary output file, removed when the JVM exits
File file = File.createTempFile(s + ".", ".xml");
file.deleteOnExit();
FileOutputStream out = new FileOutputStream(file);
// normalize each value to Unicode NFC before it is written
MarcValueTransformers marcValueTransformers = new MarcValueTransformers();
marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC));
try (MarcXchangeWriter writer = new MarcXchangeWriter(out)
.setMarcValueTransformers(marcValueTransformers)) {
Marc.builder()
.setInputStream(in)
.setCharset(StandardCharsets.UTF_8)
.setMarcListener(writer)
.build()
.writeCollection();
// the writer exposes a recorded exception via getException(); none is expected
assertNull(writer.getException());
}
// the written file must be identical to the expected XML resource
assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + ".xml").openStream()));
}
/**
 * Streams the {@code zdblokutf8.mrc} test resource as ISO 2709 chunks: the
 * first pass counts the chunks and expects 10170, the second pass visits every
 * chunk and checks the length of its data.
 *
 * @throws IOException if the resource cannot be opened or closed
 */
@Test
public void testZDBStream() throws IOException {
String s = "zdblokutf8.mrc";
InputStream in = getClass().getResource(s).openStream();
// first pass: count the chunks
long count = Marc.builder()
.setInputStream(in)
.setCharset(StandardCharsets.UTF_8)
.build().iso2709Stream().chunks().count();
in.close();
assertEquals(10170L, count);
// second pass on a freshly opened stream: visit each chunk
in = getClass().getResource(s).openStream();
Marc.builder()
.setInputStream(in)
.setCharset(StandardCharsets.UTF_8)
.build().iso2709Stream().chunks()
.forEach(chunk -> assertTrue(chunk.data().length() >= 0));
in.close();
}
/**
 * ZDB MARC Holdings.
 *
 * Reads the {@code zdblokutf8.mrc} test resource as UTF-8, writes it through a
 * {@code MarcXchangeWriter} into a temporary XML file while normalizing every
 * value to NFC, and compares the result with the resource
 * {@code zdblokutf8.mrc.xml}.
 *
 * @throws Exception if reading the resource or writing the temporary file fails
 */
@Test
public void testZDBLok() throws Exception {
String s = "zdblokutf8.mrc";
InputStream in = getClass().getResource(s).openStream();
// temporary output file, removed when the JVM exits
File file = File.createTempFile(s + ".", ".xml");
file.deleteOnExit();
FileOutputStream out = new FileOutputStream(file);
// normalize each value to Unicode NFC before it is written
MarcValueTransformers marcValueTransformers = new MarcValueTransformers();
marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC));
try (MarcXchangeWriter writer = new MarcXchangeWriter(out)
.setMarcValueTransformers(marcValueTransformers)) {
Marc.builder()
.setInputStream(in)
.setCharset(StandardCharsets.UTF_8)
.setMarcListener(writer)
.build()
.writeCollection();
// the writer exposes a recorded exception via getException(); none is expected
assertNull(writer.getException());
}
// the written file must be identical to the expected XML resource
assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + ".xml").openStream()));
}
/** /**
* There may be faulty input streams that contain information separators at the wrong place. * There may be faulty input streams that contain information separators at the wrong place.
* For the problem, see {@code org.marc4j.test.PermissiveReaderTest#testCyrillicEFix()}. * For the problem, see {@code org.marc4j.test.PermissiveReaderTest#testCyrillicEFix()}.

View file

@ -16,13 +16,26 @@
*/ */
package org.xbib.marc; package org.xbib.marc;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.xbib.helper.StreamMatcher.assertStream; import static org.xbib.helper.StreamMatcher.assertStream;
import org.junit.Test; import org.junit.Test;
import org.xbib.marc.json.MarcJsonWriter;
import org.xbib.marc.transformer.value.MarcValueTransformers;
import org.xbib.marc.xml.MarcXchangeWriter;
import org.xmlunit.matchers.CompareMatcher;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
/** /**
* *
@ -99,4 +112,72 @@ public class ZDBTest {
} }
} }
/**
 * ZDB MARC Bibliographic.
 *
 * Reads the {@code zdbtitutf8.mrc} test resource and writes it through a
 * {@code MarcJsonWriter} (constructed with the {@code jsonlines} flag set)
 * into a temporary file, normalizing every value to NFC. The test passes when
 * the writer has recorded no exception.
 *
 * @throws Exception if reading the resource or writing the temporary file fails
 */
@Test
public void testZDBBib() throws Exception {
    String s = "zdbtitutf8.mrc";
    File file = File.createTempFile(s, ".json");
    file.deleteOnExit();
    // normalize each value to Unicode NFC before it is written
    MarcValueTransformers marcValueTransformers = new MarcValueTransformers();
    marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC));
    // All three resources are managed here so that the input stream and the
    // file handle are released even when writer construction or the run fails.
    try (InputStream in = getClass().getResource(s).openStream();
         OutputStream out = new FileOutputStream(file);
         MarcJsonWriter writer = new MarcJsonWriter(out, true)
                 .setFormat(MarcXchangeConstants.MARCXCHANGE_FORMAT)
                 .setType(MarcXchangeConstants.BIBLIOGRAPHIC_TYPE)
                 .setMarcValueTransformers(marcValueTransformers)) {
        Marc.builder()
                .setInputStream(in)
                .setMarcListener(writer)
                .build()
                .writeCollection();
        assertNull(writer.getException());
    }
}
/**
 * Streams the {@code zdblokutf8.mrc} test resource as ISO 2709 chunks: the
 * first pass counts the chunks and expects 10170, the second pass visits every
 * chunk and checks the length of its data.
 *
 * @throws IOException if the resource cannot be opened or closed
 */
@Test
public void testZDBStream() throws IOException {
    String s = "zdblokutf8.mrc";
    long count;
    // try-with-resources: the stream is closed even if the pipeline throws
    try (InputStream in = getClass().getResource(s).openStream()) {
        count = Marc.builder()
                .setInputStream(in)
                .setCharset(StandardCharsets.UTF_8)
                .build().iso2709Stream().chunks().count();
    }
    assertEquals(10170L, count);
    // second pass needs a freshly opened stream; the first one is consumed
    try (InputStream in = getClass().getResource(s).openStream()) {
        Marc.builder()
                .setInputStream(in)
                .setCharset(StandardCharsets.UTF_8)
                .build().iso2709Stream().chunks()
                .forEach(chunk -> assertTrue(chunk.data().length() >= 0));
    }
}
/**
 * ZDB MARC Holdings.
 *
 * Reads the {@code zdblokutf8.mrc} test resource as UTF-8, writes it through a
 * {@code MarcXchangeWriter} into a temporary XML file while normalizing every
 * value to NFC, and compares the result with the resource
 * {@code zdblokutf8.mrc.xml}.
 *
 * @throws Exception if reading the resource or writing the temporary file fails
 */
@Test
public void testZDBLok() throws Exception {
    String s = "zdblokutf8.mrc";
    File file = File.createTempFile(s + ".", ".xml");
    file.deleteOnExit();
    // normalize each value to Unicode NFC before it is written
    MarcValueTransformers marcValueTransformers = new MarcValueTransformers();
    marcValueTransformers.setMarcValueTransformer(value -> Normalizer.normalize(value, Normalizer.Form.NFC));
    // Input, output and writer are all managed here so nothing leaks when
    // writer construction or the run fails.
    try (InputStream in = getClass().getResource(s).openStream();
         FileOutputStream out = new FileOutputStream(file);
         MarcXchangeWriter writer = new MarcXchangeWriter(out)
                 .setMarcValueTransformers(marcValueTransformers)) {
        Marc.builder()
                .setInputStream(in)
                .setCharset(StandardCharsets.UTF_8)
                .setMarcListener(writer)
                .build()
                .writeCollection();
        assertNull(writer.getException());
    }
    // the expected-XML stream is closed after the comparison as well
    try (InputStream expected = getClass().getResource(s + ".xml").openStream()) {
        assertThat(file, CompareMatcher.isIdenticalTo(expected));
    }
}
} }

View file

@ -178,13 +178,13 @@ public class MarcJsonWriterTest {
.writeCollection(); .writeCollection();
assertEquals(10, writer.getRecordCounter()); assertEquals(10, writer.getRecordCounter());
File f0 = new File("build/0.json"); File f0 = new File("build/0.json");
assertTrue(f0.exists() && f0.length() == 6022); assertTrue(f0.exists() && f0.length() == 6015);
File f1 = new File("build/1.json"); File f1 = new File("build/1.json");
assertTrue(f1.exists() && f1.length() == 7150); assertTrue(f1.exists() && f1.length() == 7127);
File f2 = new File("build/2.json"); File f2 = new File("build/2.json");
assertTrue(f2.exists() && f2.length() == 6424); assertTrue(f2.exists() && f2.length() == 6426);
File f3 = new File("build/3.json"); File f3 = new File("build/3.json");
assertTrue(f3.exists() && f3.length() == 2114); assertTrue(f3.exists() && f3.length() == 2110);
File f4 = new File("build/4.json"); File f4 = new File("build/4.json");
assertFalse(f4.exists()); assertFalse(f4.exists());
} }

View file

@ -0,0 +1,45 @@
package org.xbib.marc.transformer;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
import org.xbib.marc.MarcField;
import org.xbib.marc.transformer.value.MarcValueTransformer;
import org.xbib.marc.transformer.value.MarcValueTransformers;
/**
 * Tests that {@code MarcValueTransformers.transformValue} applies the
 * configured value transformer to subfield values and to the value of a
 * field built without subfields.
 */
public class MarcValueTransformerTest {

    /**
     * A field with two subfields: only the subfield whose value equals
     * "World" is rewritten to "Earth".
     */
    @Test
    public void testValueTransformer() {
        MarcValueTransformer worldToEarth = value -> value.equals("World") ? "Earth" : value;
        MarcValueTransformers transformers = new MarcValueTransformers();
        transformers.setMarcValueTransformer(worldToEarth);
        MarcField original = MarcField.builder()
                .tag("100")
                .subfield("a", "Hello")
                .subfield("b", "World")
                .build();
        MarcField transformed = transformers.transformValue(original);
        assertEquals("100$$ab[a=Hello, b=Earth]", transformed.toString());
    }

    /**
     * A field that carries a plain value instead of subfields: the value
     * itself is rewritten from "World" to "Earth".
     */
    @Test
    public void testValueControlFieldTransformer() {
        MarcValueTransformer worldToEarth = value -> value.equals("World") ? "Earth" : value;
        MarcValueTransformers transformers = new MarcValueTransformers();
        transformers.setMarcValueTransformer(worldToEarth);
        MarcField original = MarcField.builder()
                .tag("001")
                .value("World")
                .build();
        MarcField transformed = transformers.transformValue(original);
        assertEquals("001$$Earth", transformed.toString());
    }
}