add buffer size parameter, marc4j comparison test

Jörg Prante 2020-05-02 18:49:19 +02:00
parent c8b8955303
commit b123d27567
19 changed files with 272 additions and 64 deletions
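
For orientation, a minimal usage sketch of the new buffer size parameter through the builder API. This is not part of the commit: the input/output paths and the 64 KiB size are illustrative assumptions, while the builder calls mirror those used in MarcTool and the tests below.

import org.xbib.marc.Marc;
import org.xbib.marc.xml.MarcXchangeWriter;

import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class BufferSizeExample {
    public static void main(String[] args) throws Exception {
        // hypothetical input/output paths, for illustration only
        try (InputStream in = Files.newInputStream(Paths.get("records.mrc"));
             MarcXchangeWriter writer =
                     new MarcXchangeWriter(Files.newBufferedWriter(Paths.get("records.xml")), true)) {
            Marc.builder()
                    .setInputStream(in)
                    .setCharset(StandardCharsets.UTF_8)
                    .setMarcListener(writer)
                    .build()
                    .writeCollection(64 * 1024); // new: explicit buffer size for the separator stream
        }
    }
}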

View file

@ -27,6 +27,7 @@ dependencies {
testImplementation("org.mockito:mockito-core:${project.property('mockito.version')}") {
exclude group: 'org.hamcrest'
}
testImplementation "org.marc4j:marc4j:${project.property('marc4j.version')}"
}
compileJava {

View file

@ -1,9 +1,9 @@
group = org.xbib
name = marc
version = 2.1.0
version = 2.2.0
# main
xbib-content.version = 2.0.4
xbib-content.version = 2.0.5
# runtime
xbib-bibliographic-character-sets.version = 1.0.0
@ -15,3 +15,4 @@ xalan.version = 2.7.2
xmlunit-matchers.version = 2.6.3
system-rules.version = 1.19.0
mockito.version = 3.1.0
marc4j.version = 2.9.1

View file

@ -87,6 +87,8 @@ public final class Marc {
private static final byte[] CRLF = { '\r', '\n'};
private static final int DEFAULT_BUFFER_SIZE = 8192;
private final Builder builder;
private Marc(Builder builder) {
@ -101,12 +103,17 @@ public final class Marc {
return new Builder();
}
public MarcIso2709Reader iso2709XmlReader() {
return iso2709XmlReader(DEFAULT_BUFFER_SIZE);
}
/**
* Return an XML reader on an ISO 2709 input stream.
* @param bufferSize buffer size for input stream
* @return XML reader
*/
public MarcIso2709Reader iso2709XmlReader() {
return new MarcIso2709Reader(builder);
public MarcIso2709Reader iso2709XmlReader(int bufferSize) {
return new MarcIso2709Reader(builder, bufferSize);
}
/**
@ -137,12 +144,17 @@ public final class Marc {
xmlEventReader.close();
}
public BufferedSeparatorInputStream iso2709Stream() {
return iso2709Stream(DEFAULT_BUFFER_SIZE);
}
/**
* Return an ISO 2709 stream.
* @param bufferSize buffer size
* @return ISO 2709 stream
*/
public BufferedSeparatorInputStream iso2709Stream() {
return builder.iso2709Stream();
public BufferedSeparatorInputStream iso2709Stream(int bufferSize) {
return builder.iso2709Stream(bufferSize);
}
/**
@ -224,7 +236,8 @@ public final class Marc {
* @throws IOException if parsing fails
*/
public Document document() throws IOException {
return new Sax2Dom(iso2709XmlReader(), new InputSource(builder.getInputStream())).document();
return new Sax2Dom(iso2709XmlReader(DEFAULT_BUFFER_SIZE),
new InputSource(builder.getInputStream())).document();
}
/**
@ -260,16 +273,25 @@ public final class Marc {
}
}
public void writeCollection() throws IOException {
writeCollection(DEFAULT_BUFFER_SIZE);
}
/**
* Write MARC bibliographic data from a separator stream, chunk by chunk, to a MARC collection.
* @param bufferSize buffer size for the separator input stream
* @throws IOException if writing fails
*/
public void writeCollection() throws IOException {
wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream()));
public void writeCollection(int bufferSize) throws IOException {
wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize));
}
public void writeCollection(String type) throws IOException {
wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream()));
writeCollection(type, DEFAULT_BUFFER_SIZE);
}
public void writeCollection(String type, int bufferSize) throws IOException {
wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize));
}
public int wrapIntoCollection(ChunkStream<byte[], BytesReference> stream) throws IOException {
@ -331,21 +353,31 @@ public final class Marc {
return count;
}
public void writeRecordCollection() throws IOException {
writeRecordCollection(DEFAULT_BUFFER_SIZE);
}
/**
* Write MARC bibliographic events from a separator stream, record by record, wrapped into a
* pair of {@code collection} elements.
* @param bufferSize buffer size
* @throws IOException if writing fails
*/
public void writeRecordCollection() throws IOException {
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), true);
public void writeRecordCollection(int bufferSize) throws IOException {
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), true);
}
public void writeRecords() throws IOException {
writeRecords(DEFAULT_BUFFER_SIZE);
}
/**
* Write MARC bibliographic events from a separator stream, record by record.
* @param bufferSize buffer size for separator input stream
* @throws IOException if writing fails
*/
public void writeRecords() throws IOException {
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), false);
public void writeRecords(int bufferSize) throws IOException {
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), false);
}
/**
@ -551,8 +583,11 @@ public final class Marc {
*/
public static class MarcIso2709Reader extends MarcXmlReader {
private MarcIso2709Reader(Builder builder) {
private final int bufferSize;
private MarcIso2709Reader(Builder builder, int bufferSize) {
super(builder);
this.bufferSize = bufferSize;
}
@Override
@ -560,7 +595,8 @@ public final class Marc {
if (input.getByteStream() == null) {
throw new IllegalArgumentException("no input stream found");
}
try (BufferedSeparatorInputStream stream = new BufferedSeparatorInputStream(input.getByteStream())) {
try (BufferedSeparatorInputStream stream =
new BufferedSeparatorInputStream(input.getByteStream(), bufferSize)) {
MarcGenerator marcGenerator = builder.createGenerator();
Chunk<byte[], BytesReference> chunk;
while ((chunk = stream.readChunk()) != null) {
@ -895,10 +931,11 @@ public final class Marc {
/**
* Create an ISO 2709 stream.
* @param bufferSize buffer size
* @return ISO 2709 stream
*/
public BufferedSeparatorInputStream iso2709Stream() {
return new BufferedSeparatorInputStream(inputStream);
public BufferedSeparatorInputStream iso2709Stream(int bufferSize) {
return new BufferedSeparatorInputStream(inputStream, bufferSize);
}
/**
@ -1088,7 +1125,7 @@ public final class Marc {
*/
public Iterator<MarcRecord> recordIterator() {
if (stream == null) {
this.stream = new BufferedSeparatorInputStream(inputStream);
this.stream = new BufferedSeparatorInputStream(inputStream, DEFAULT_BUFFER_SIZE);
}
if (marcGenerator == null) {
this.marcGenerator = createGenerator();
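
With the Marc.java changes above, each entry point now exists in a no-argument form (delegating to DEFAULT_BUFFER_SIZE) and a form taking an explicit buffer size. Below is a small sketch of the chunk-streaming variant, not part of the commit; the file name is a hypothetical example, the calls follow HBZTest and BufferedSeparatorInputStreamTest in this commit.

try (InputStream in = Files.newInputStream(Paths.get("input.mrc"))) { // hypothetical file
    Marc marc = Marc.builder()
            .setInputStream(in)
            .setCharset(StandardCharsets.UTF_8)
            .build();
    // explicit 64 KiB buffer for the underlying BufferedSeparatorInputStream
    long chunks = marc.iso2709Stream(64 * 1024).chunks().count();
}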

View file

@ -43,12 +43,24 @@ public class AlephSequentialInputStream extends PatternInputStream {
private String alephSysNumber;
public AlephSequentialInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
super(in, pattern);
public AlephSequentialInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
public AlephSequentialInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput();
// this format might come without a record label, create a default one
this.label = RecordLabel.builder().setIndicatorLength(2).setSubfieldIdentifierLength(1).build();
this.label = RecordLabel.builder()
.setIndicatorLength(2)
.setSubfieldIdentifierLength(1)
.build();
}
@Override

View file

@ -53,8 +53,17 @@ public class BiblioMondoInputStream extends PatternInputStream {
private final BytesStreamOutput bytesStreamOutput;
public BiblioMondoInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
super(in, pattern);
public BiblioMondoInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
public BiblioMondoInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput();
}

View file

@ -50,8 +50,19 @@ public class MabDisketteInputStream extends PatternInputStream {
this(in, pattern, '\u0000', marcGenerator);
}
public MabDisketteInputStream(InputStream in, byte[] pattern, char subfieldDelimiter, MarcGenerator marcGenerator) {
super(in, pattern);
public MabDisketteInputStream(InputStream in,
byte[] pattern,
char subfieldDelimiter,
MarcGenerator marcGenerator) {
this(in, pattern, subfieldDelimiter, marcGenerator, 8192);
}
public MabDisketteInputStream(InputStream in,
byte[] pattern,
char subfieldDelimiter,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator;
this.subfieldDelimiter = subfieldDelimiter;
this.bytesStreamOutput = new BytesStreamOutput();

View file

@ -50,8 +50,17 @@ public class PicaInputStream extends PatternInputStream {
private final BytesStreamOutput bytesStreamOutput;
public PicaInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
super(in, pattern);
public PicaInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
public PicaInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput();
}

View file

@ -52,8 +52,17 @@ public class PicaPlainInputStream extends PatternInputStream {
private boolean started;
public PicaPlainInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
super(in, pattern);
public PicaPlainInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
public PicaPlainInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput();
this.started = true;

View file

@ -72,14 +72,24 @@ public class SisisInputStream extends PatternInputStream {
private boolean labelEmitted;
public SisisInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
/**
* Create a SISIS input stream.
* @param in the underlying input stream
* @param pattern the pattern for the separator
* @param marcGenerator a MARC generator
* @param bufferSize buffer size
*/
public SisisInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
super(in, pattern);
public SisisInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput();
// this format comes without a record label, create a default one

View file

@ -45,14 +45,6 @@ abstract class BaseChunkStream extends BufferedInputStream implements ChunkStrea
int buffersize;
/**
* Create a base chunk stream.
* @param in the underlying input stream
*/
BaseChunkStream(InputStream in) {
this(in, DEFAULT_BUFFER_SIZE);
}
/**
* Create a base chunk stream.
* @param in the underlying input stream

View file

@ -70,9 +70,10 @@ public class BufferedSeparatorInputStream extends BaseChunkStream {
/**
* Create a buffered information separator stream.
* @param in the underlying input stream
* @param bufferSize the buffer size
*/
public BufferedSeparatorInputStream(InputStream in) {
super(in);
public BufferedSeparatorInputStream(InputStream in, int bufferSize) {
super(in, bufferSize);
}
@Override

View file

@ -39,9 +39,10 @@ public class PatternInputStream extends BaseChunkStream {
* Create a pattern delimited input stream.
* @param in the underlying input stream
* @param pattern the pattern
* @param bufferSize buffer size
*/
public PatternInputStream(InputStream in, byte[] pattern) {
super(in);
public PatternInputStream(InputStream in, byte[] pattern, int bufferSize) {
super(in, bufferSize);
requireNonNull(pattern);
this.pattern = pattern.clone();
}
@ -49,19 +50,21 @@ public class PatternInputStream extends BaseChunkStream {
/**
* Convenience method to create a line-feed pattern-separated input stream.
* @param in the input stream to wrap
* @param bufferSize buffer size
* @return the pattern input stream
*/
public static PatternInputStream lf(InputStream in) {
return new PatternInputStream(in, LF);
public static PatternInputStream lf(InputStream in, int bufferSize) {
return new PatternInputStream(in, LF, bufferSize);
}
/**
* Convenience method to create a carriage-return/line-feed pattern-separated input stream.
* @param in the input stream to wrap
* @param bufferSize buffer size
* @return the pattern input stream
*/
public static PatternInputStream crlf(InputStream in) {
return new PatternInputStream(in, CRLF);
public static PatternInputStream crlf(InputStream in, int bufferSize) {
return new PatternInputStream(in, CRLF, bufferSize);
}
/**

View file

@ -19,6 +19,7 @@ package org.xbib.marc.tools;
import org.xbib.marc.Marc;
import org.xbib.marc.xml.MarcXchangeWriter;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;
@ -101,9 +102,10 @@ public class MarcTool {
}
if ("marc2xml".equals(mode)) {
try (InputStream in = Files.newInputStream(Paths.get(input));
BufferedInputStream bufferedInputStream = new BufferedInputStream(in, 65536);
MarcXchangeWriter writer = new MarcXchangeWriter(Files.newBufferedWriter(Paths.get(output)), true)) {
Marc.Builder builder = Marc.builder()
.setInputStream(in)
.setInputStream(bufferedInputStream)
.setCharset(Charset.forName(charset))
.setMarcListener(writer);
if (schema != null && stylesheet != null && result != null) {
@ -111,7 +113,7 @@ public class MarcTool {
builder.setSchema(schema).build().transform(new URL(stylesheet),
new StreamResult(Files.newBufferedWriter(Paths.get(result))));
} else {
builder.build().writeCollection();
builder.build().writeCollection(65536);
}
} catch (Exception e) {
logger.log(Level.SEVERE, e.getMessage(), e);

View file

@ -9,6 +9,7 @@ import org.xbib.marc.MarcRecord;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicInteger;
/**
@ -26,7 +27,7 @@ public class HBZTest {
try (InputStream in = getClass().getResource(file).openStream()) {
Marc marc = Marc.builder()
.setInputStream(in)
.setCharset(Charset.forName("UTF-8"))
.setCharset(StandardCharsets.UTF_8)
.build();
marc.iso2709Stream().chunks().forEach(chunk -> {
count.incrementAndGet();

View file

@ -323,7 +323,7 @@ public class MabTest {
assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + "-mapped.xml").openStream()));
}
private class LOWTransformer implements MarcValueTransformer {
private static class LOWTransformer implements MarcValueTransformer {
@Override
public String transform(String value) {

View file

@ -10,6 +10,7 @@ import org.xbib.marc.label.RecordLabel;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicInteger;
/**
@ -29,7 +30,7 @@ public class OBVSGTest {
try (InputStream in = getClass().getResource(file).openStream()) {
Marc marc = Marc.builder()
.setInputStream(in)
.setCharset(Charset.forName("UTF-8"))
.setCharset(StandardCharsets.UTF_8)
.build();
marc.iso2709Stream().chunks().forEach(chunk -> {
count.incrementAndGet();

View file

@ -83,7 +83,8 @@ public class BufferedSeparatorInputStreamTest {
};
String s = "sequential.groupstream";
InputStream in = getClass().getResource(s).openStream();
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in, 8192);
Chunk<byte[], BytesReference> chunk;
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
listener.chunk(chunk);
@ -101,7 +102,8 @@ public class BufferedSeparatorInputStreamTest {
String s = "sequential.groupstream";
InputStream in = getClass().getResource(s).openStream();
final AtomicInteger count = new AtomicInteger(0);
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in, 8192);
ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> count.incrementAndGet();
Chunk<byte[], BytesReference> chunk;
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
@ -117,7 +119,8 @@ public class BufferedSeparatorInputStreamTest {
Map<Integer, Integer> map2 = new LinkedHashMap<>();
InputStream in2 = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
final AtomicInteger count2 = new AtomicInteger(0);
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in2);
BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in2, 8192);
ChunkListener<byte[], BytesReference> chunkListener2 =
(chunk2) -> map2.put(count2.incrementAndGet(), chunk2.data().length());
Chunk<byte[], BytesReference> chunk2;
@ -151,7 +154,8 @@ public class BufferedSeparatorInputStreamTest {
public void testChunkCount() throws Exception {
String s = "periouni.mrc";
InputStream in = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in, 8192);
long l = bufferedSeparatorInputStream.chunks().count();
assertEquals(192247, l);
}
@ -162,7 +166,8 @@ public class BufferedSeparatorInputStreamTest {
Map<Integer, Integer> map = new LinkedHashMap<>();
InputStream in = getClass().getResource("/org/xbib/marc/" + s).openStream();
final AtomicInteger count = new AtomicInteger(0);
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in, 8192);
ChunkListener<byte[], BytesReference> chunkListener =
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
Chunk<byte[], BytesReference> chunk;
@ -174,5 +179,4 @@ public class BufferedSeparatorInputStreamTest {
+ "31=2, 32=9, 33=9, 34=2, 35=6, 36=9, 37=0}", map.toString());
in.close();
}
}

View file

@ -38,7 +38,7 @@ public class PatternInputStreamTest {
byte[] b = "Hello\nWorld".getBytes(StandardCharsets.UTF_8);
Map<Integer, Integer> map = new LinkedHashMap<>();
final AtomicInteger count = new AtomicInteger(0);
PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b));
PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b), 1024);
ChunkListener<byte[], BytesReference> chunkListener =
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
Chunk<byte[], BytesReference> chunk;
@ -55,7 +55,7 @@ public class PatternInputStreamTest {
byte[] b = "Hello\r\nWorld".getBytes(StandardCharsets.UTF_8);
Map<Integer, Integer> map = new LinkedHashMap<>();
final AtomicInteger count = new AtomicInteger(0);
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b));
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b), 1024);
ChunkListener<byte[], BytesReference> chunkListener =
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
Chunk<byte[], BytesReference> chunk;
@ -74,7 +74,8 @@ public class PatternInputStreamTest {
"Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8)
};
for (byte[] b : bytes) {
PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b));
PatternInputStream separatorStream =
PatternInputStream.lf(new ByteArrayInputStream(b), 8192);
long l = separatorStream.chunks().count();
separatorStream.close();
assertEquals(2L, l);
@ -89,7 +90,8 @@ public class PatternInputStreamTest {
}
Map<Integer, Integer> map = new LinkedHashMap<>();
final AtomicInteger count = new AtomicInteger(0);
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()));
PatternInputStream separatorStream =
PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192);
ChunkListener<byte[], BytesReference> chunkListener =
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
Chunk<byte[], BytesReference> chunk;
@ -108,7 +110,8 @@ public class PatternInputStreamTest {
output.write("Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8));
}
final AtomicInteger count = new AtomicInteger(0);
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()));
PatternInputStream separatorStream =
PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192);
separatorStream.chunks().forEach(chunk -> {
count.incrementAndGet();
assertEquals(5, chunk.data().length());

View file

@ -0,0 +1,102 @@
package org.xbib.marc.io;
import org.junit.Ignore;
import org.junit.Test;
import org.marc4j.MarcPermissiveStreamReader;
import org.marc4j.MarcReader;
import org.marc4j.marc.Record;
import org.xbib.marc.Marc;
import org.xbib.marc.MarcRecord;
import org.xbib.marc.MarcRecordListener;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
@Ignore
public class UncompressLargeFileTest {
private static final Logger logger = Logger.getLogger(UncompressLargeFileTest.class.getName());
@Test
public void uncompress() throws IOException {
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
byte[] buffer = new byte[1024 * 1024];
int length;
while ((length = gzipInputStream.read(buffer)) != -1) {
// do nothing
}
gzipInputStream.close();
inputStream.close();
}
@Test
public void uncompressAndDecodeChunks() throws Exception {
logger.log(Level.INFO, "start decoding chunks");
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
final AtomicInteger counter = new AtomicInteger(0);
BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(gzipInputStream, 1024 * 1024);
ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> counter.incrementAndGet();
Chunk<byte[], BytesReference> chunk;
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
chunkListener.chunk(chunk);
}
gzipInputStream.close();
logger.log(Level.INFO, "stop decoding chunks, counter = " + counter.get());
}
@Test
public void uncompressAndDecodeMarcRecords() throws IOException {
logger.log(Level.INFO, "start decoding MARC");
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
final AtomicInteger counter = new AtomicInteger(0);
Marc.builder()
.setInputStream(gzipInputStream)
.setMarcRecordListener(new MarcRecordListener() {
@Override
public void beginCollection() {
}
@Override
public void record(MarcRecord marcRecord) {
counter.incrementAndGet();
}
@Override
public void endCollection() {
}
})
.setCharset(StandardCharsets.UTF_8)
.build()
.writeRecords(1024 * 1024);
gzipInputStream.close();
inputStream.close();
logger.log(Level.INFO, "stop deocding MARC, counter = " + counter.get());
}
@Test
public void uncompressAndDecodeWithMarc4j() throws Exception {
logger.log(Level.INFO, "start decoding MARC4J");
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
final AtomicInteger counter = new AtomicInteger(0);
MarcReader reader = new MarcPermissiveStreamReader(gzipInputStream, true, true);
while (reader.hasNext()) {
Record record = reader.next();
counter.incrementAndGet();
// do nothing
}
gzipInputStream.close();
inputStream.close();
logger.log(Level.INFO, "stop deocding MARC4J, counter = " + counter.get());
}
}