add buffer size parameter, marc4j comparison test

This commit is contained in:
Jörg Prante 2020-05-02 18:49:19 +02:00
parent c8b8955303
commit b123d27567
19 changed files with 272 additions and 64 deletions

View file

@ -27,6 +27,7 @@ dependencies {
testImplementation("org.mockito:mockito-core:${project.property('mockito.version')}") { testImplementation("org.mockito:mockito-core:${project.property('mockito.version')}") {
exclude group: 'org.hamcrest' exclude group: 'org.hamcrest'
} }
testImplementation "org.marc4j:marc4j:${project.property('marc4j.version')}"
} }
compileJava { compileJava {

View file

@ -1,9 +1,9 @@
group = org.xbib group = org.xbib
name = marc name = marc
version = 2.1.0 version = 2.2.0
# main # main
xbib-content.version = 2.0.4 xbib-content.version = 2.0.5
# runtime # runtime
xbib-bibliographic-character-sets.version = 1.0.0 xbib-bibliographic-character-sets.version = 1.0.0
@ -15,3 +15,4 @@ xalan.version = 2.7.2
xmlunit-matchers.version = 2.6.3 xmlunit-matchers.version = 2.6.3
system-rules.version = 1.19.0 system-rules.version = 1.19.0
mockito.version = 3.1.0 mockito.version = 3.1.0
marc4j.version = 2.9.1

View file

@ -87,6 +87,8 @@ public final class Marc {
private static final byte[] CRLF = { '\r', '\n'}; private static final byte[] CRLF = { '\r', '\n'};
private static final int DEFAULT_BUFFER_SIZE = 8192;
private final Builder builder; private final Builder builder;
private Marc(Builder builder) { private Marc(Builder builder) {
@ -101,12 +103,17 @@ public final class Marc {
return new Builder(); return new Builder();
} }
public MarcIso2709Reader iso2709XmlReader() {
return iso2709XmlReader(DEFAULT_BUFFER_SIZE);
}
/** /**
* Return an XML reader on a ISO 2709 input stream. * Return an XML reader on a ISO 2709 input stream.
* @param bufferSize buffer size for input stream
* @return XML reader * @return XML reader
*/ */
public MarcIso2709Reader iso2709XmlReader() { public MarcIso2709Reader iso2709XmlReader(int bufferSize) {
return new MarcIso2709Reader(builder); return new MarcIso2709Reader(builder, bufferSize);
} }
/** /**
@ -137,12 +144,17 @@ public final class Marc {
xmlEventReader.close(); xmlEventReader.close();
} }
public BufferedSeparatorInputStream iso2709Stream() {
return iso2709Stream(DEFAULT_BUFFER_SIZE);
}
/** /**
* Return ISO 2709 stream. * Return ISO 2709 stream.
* @param bufferSize buffer size
* @return ISO 2709 stream * @return ISO 2709 stream
*/ */
public BufferedSeparatorInputStream iso2709Stream() { public BufferedSeparatorInputStream iso2709Stream(int bufferSize) {
return builder.iso2709Stream(); return builder.iso2709Stream(bufferSize);
} }
/** /**
@ -224,7 +236,8 @@ public final class Marc {
* @throws IOException if parsing fails * @throws IOException if parsing fails
*/ */
public Document document() throws IOException { public Document document() throws IOException {
return new Sax2Dom(iso2709XmlReader(), new InputSource(builder.getInputStream())).document(); return new Sax2Dom(iso2709XmlReader(DEFAULT_BUFFER_SIZE),
new InputSource(builder.getInputStream())).document();
} }
/** /**
@ -260,16 +273,25 @@ public final class Marc {
} }
} }
public void writeCollection() throws IOException {
writeCollection(DEFAULT_BUFFER_SIZE);
}
/** /**
* Write MARC bibliographic data from seperator stream chunk by chunk to a MARC collection. * Write MARC bibliographic data from seperator stream chunk by chunk to a MARC collection.
* @param bufferSize buffer size for the separator input stream
* @throws IOException if writing fails * @throws IOException if writing fails
*/ */
public void writeCollection() throws IOException { public void writeCollection(int bufferSize) throws IOException {
wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream())); wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize));
} }
public void writeCollection(String type) throws IOException { public void writeCollection(String type) throws IOException {
wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream())); writeCollection(type, DEFAULT_BUFFER_SIZE);
}
public void writeCollection(String type, int bufferSize) throws IOException {
wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize));
} }
public int wrapIntoCollection(ChunkStream<byte[], BytesReference> stream) throws IOException { public int wrapIntoCollection(ChunkStream<byte[], BytesReference> stream) throws IOException {
@ -331,21 +353,31 @@ public final class Marc {
return count; return count;
} }
public void writeRecordCollection() throws IOException {
writeRecordCollection(DEFAULT_BUFFER_SIZE);
}
/** /**
* Write MARC bibliographic events from a separator strem, record by record, wrapped into a * Write MARC bibliographic events from a separator strem, record by record, wrapped into a
* pair of {@code collection} elements. * pair of {@code collection} elements.
* @param bufferSize buffer size
* @throws IOException if writing fails * @throws IOException if writing fails
*/ */
public void writeRecordCollection() throws IOException { public void writeRecordCollection(int bufferSize) throws IOException {
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), true); wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), true);
}
public void writeRecords() throws IOException {
writeRecords(DEFAULT_BUFFER_SIZE);
} }
/** /**
* Write MARC bibliographic events from a separator strem, record by record. * Write MARC bibliographic events from a separator strem, record by record.
* @param bufferSize buffer size for separator input stream
* @throws IOException if writing fails * @throws IOException if writing fails
*/ */
public void writeRecords() throws IOException { public void writeRecords(int bufferSize) throws IOException {
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), false); wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), false);
} }
/** /**
@ -551,8 +583,11 @@ public final class Marc {
*/ */
public static class MarcIso2709Reader extends MarcXmlReader { public static class MarcIso2709Reader extends MarcXmlReader {
private MarcIso2709Reader(Builder builder) { private final int bufferSize;
private MarcIso2709Reader(Builder builder, int bufferSize) {
super(builder); super(builder);
this.bufferSize = bufferSize;
} }
@Override @Override
@ -560,7 +595,8 @@ public final class Marc {
if (input.getByteStream() == null) { if (input.getByteStream() == null) {
throw new IllegalArgumentException("no input stream found"); throw new IllegalArgumentException("no input stream found");
} }
try (BufferedSeparatorInputStream stream = new BufferedSeparatorInputStream(input.getByteStream())) { try (BufferedSeparatorInputStream stream =
new BufferedSeparatorInputStream(input.getByteStream(), bufferSize)) {
MarcGenerator marcGenerator = builder.createGenerator(); MarcGenerator marcGenerator = builder.createGenerator();
Chunk<byte[], BytesReference> chunk; Chunk<byte[], BytesReference> chunk;
while ((chunk = stream.readChunk()) != null) { while ((chunk = stream.readChunk()) != null) {
@ -895,10 +931,11 @@ public final class Marc {
/** /**
* Create an ISO 2709 stream. * Create an ISO 2709 stream.
* @param bufferSize buffer size
* @return ISO 2709 stream * @return ISO 2709 stream
*/ */
public BufferedSeparatorInputStream iso2709Stream() { public BufferedSeparatorInputStream iso2709Stream(int bufferSize) {
return new BufferedSeparatorInputStream(inputStream); return new BufferedSeparatorInputStream(inputStream, bufferSize);
} }
/** /**
@ -1088,7 +1125,7 @@ public final class Marc {
*/ */
public Iterator<MarcRecord> recordIterator() { public Iterator<MarcRecord> recordIterator() {
if (stream == null) { if (stream == null) {
this.stream = new BufferedSeparatorInputStream(inputStream); this.stream = new BufferedSeparatorInputStream(inputStream, DEFAULT_BUFFER_SIZE);
} }
if (marcGenerator == null) { if (marcGenerator == null) {
this.marcGenerator = createGenerator(); this.marcGenerator = createGenerator();

View file

@ -43,12 +43,24 @@ public class AlephSequentialInputStream extends PatternInputStream {
private String alephSysNumber; private String alephSysNumber;
public AlephSequentialInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { public AlephSequentialInputStream(InputStream in,
super(in, pattern); byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
public AlephSequentialInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator; this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput(); this.bytesStreamOutput = new BytesStreamOutput();
// this format might come without a record label, create a default one // this format might come without a record label, create a default one
this.label = RecordLabel.builder().setIndicatorLength(2).setSubfieldIdentifierLength(1).build(); this.label = RecordLabel.builder()
.setIndicatorLength(2)
.setSubfieldIdentifierLength(1)
.build();
} }
@Override @Override

View file

@ -53,8 +53,17 @@ public class BiblioMondoInputStream extends PatternInputStream {
private final BytesStreamOutput bytesStreamOutput; private final BytesStreamOutput bytesStreamOutput;
public BiblioMondoInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { public BiblioMondoInputStream(InputStream in,
super(in, pattern); byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
public BiblioMondoInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator; this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput(); this.bytesStreamOutput = new BytesStreamOutput();
} }

View file

@ -50,8 +50,19 @@ public class MabDisketteInputStream extends PatternInputStream {
this(in, pattern, '\u0000', marcGenerator); this(in, pattern, '\u0000', marcGenerator);
} }
public MabDisketteInputStream(InputStream in, byte[] pattern, char subfieldDelimiter, MarcGenerator marcGenerator) { public MabDisketteInputStream(InputStream in,
super(in, pattern); byte[] pattern,
char subfieldDelimiter,
MarcGenerator marcGenerator) {
this(in, pattern, subfieldDelimiter, marcGenerator, 8192);
}
public MabDisketteInputStream(InputStream in,
byte[] pattern,
char subfieldDelimiter,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator; this.marcGenerator = marcGenerator;
this.subfieldDelimiter = subfieldDelimiter; this.subfieldDelimiter = subfieldDelimiter;
this.bytesStreamOutput = new BytesStreamOutput(); this.bytesStreamOutput = new BytesStreamOutput();

View file

@ -50,8 +50,17 @@ public class PicaInputStream extends PatternInputStream {
private final BytesStreamOutput bytesStreamOutput; private final BytesStreamOutput bytesStreamOutput;
public PicaInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { public PicaInputStream(InputStream in,
super(in, pattern); byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
public PicaInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator; this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput(); this.bytesStreamOutput = new BytesStreamOutput();
} }

View file

@ -52,8 +52,17 @@ public class PicaPlainInputStream extends PatternInputStream {
private boolean started; private boolean started;
public PicaPlainInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { public PicaPlainInputStream(InputStream in,
super(in, pattern); byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
public PicaPlainInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator; this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput(); this.bytesStreamOutput = new BytesStreamOutput();
this.started = true; this.started = true;

View file

@ -72,14 +72,24 @@ public class SisisInputStream extends PatternInputStream {
private boolean labelEmitted; private boolean labelEmitted;
public SisisInputStream(InputStream in,
byte[] pattern,
MarcGenerator marcGenerator) {
this(in, pattern, marcGenerator, 8192);
}
/** /**
* Create a SISIS input stream. * Create a SISIS input stream.
* @param in the underlying input stream * @param in the underlying input stream
* @param pattern the pattern for the separator * @param pattern the pattern for the separator
* @param marcGenerator a MARC generator * @param marcGenerator a MARC generator
* @param bufferSize buffer size
*/ */
public SisisInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { public SisisInputStream(InputStream in,
super(in, pattern); byte[] pattern,
MarcGenerator marcGenerator,
int bufferSize) {
super(in, pattern, bufferSize);
this.marcGenerator = marcGenerator; this.marcGenerator = marcGenerator;
this.bytesStreamOutput = new BytesStreamOutput(); this.bytesStreamOutput = new BytesStreamOutput();
// this format comes without a record label, create a default one // this format comes without a record label, create a default one

View file

@ -45,14 +45,6 @@ abstract class BaseChunkStream extends BufferedInputStream implements ChunkStrea
int buffersize; int buffersize;
/**
* Create a base chunk stream.
* @param in the underlying input stream
*/
BaseChunkStream(InputStream in) {
this(in, DEFAULT_BUFFER_SIZE);
}
/** /**
* Create a base chunk stream. * Create a base chunk stream.
* @param in the underlying input stream * @param in the underlying input stream

View file

@ -70,9 +70,10 @@ public class BufferedSeparatorInputStream extends BaseChunkStream {
/** /**
* Create a buffered information separator stream. * Create a buffered information separator stream.
* @param in the underlying input stream * @param in the underlying input stream
* @param bufferSize the buffer size
*/ */
public BufferedSeparatorInputStream(InputStream in) { public BufferedSeparatorInputStream(InputStream in, int bufferSize) {
super(in); super(in, bufferSize);
} }
@Override @Override

View file

@ -39,9 +39,10 @@ public class PatternInputStream extends BaseChunkStream {
* Create a pattern delimited input stream. * Create a pattern delimited input stream.
* @param in the underlying input stream * @param in the underlying input stream
* @param pattern the pattern * @param pattern the pattern
* @param bufferSize buffer size
*/ */
public PatternInputStream(InputStream in, byte[] pattern) { public PatternInputStream(InputStream in, byte[] pattern, int bufferSize) {
super(in); super(in, bufferSize);
requireNonNull(pattern); requireNonNull(pattern);
this.pattern = pattern.clone(); this.pattern = pattern.clone();
} }
@ -49,19 +50,21 @@ public class PatternInputStream extends BaseChunkStream {
/** /**
* Convenience method to cerate a line-feed pattern separated input stream. * Convenience method to cerate a line-feed pattern separated input stream.
* @param in the input stream to wrap * @param in the input stream to wrap
* @param bufferSize buffer size
* @return the pattern input stream * @return the pattern input stream
*/ */
public static PatternInputStream lf(InputStream in) { public static PatternInputStream lf(InputStream in, int bufferSize) {
return new PatternInputStream(in, LF); return new PatternInputStream(in, LF, bufferSize);
} }
/** /**
* Convenience method to cerate a carriage-return/line-feed pattern separated input stream. * Convenience method to cerate a carriage-return/line-feed pattern separated input stream.
* @param in the input stream to wrap * @param in the input stream to wrap
* @param bufferSize buffer size
* @return the pattern input stream * @return the pattern input stream
*/ */
public static PatternInputStream crlf(InputStream in) { public static PatternInputStream crlf(InputStream in, int bufferSize) {
return new PatternInputStream(in, CRLF); return new PatternInputStream(in, CRLF, bufferSize);
} }
/** /**

View file

@ -19,6 +19,7 @@ package org.xbib.marc.tools;
import org.xbib.marc.Marc; import org.xbib.marc.Marc;
import org.xbib.marc.xml.MarcXchangeWriter; import org.xbib.marc.xml.MarcXchangeWriter;
import java.io.BufferedInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.net.URL; import java.net.URL;
import java.nio.charset.Charset; import java.nio.charset.Charset;
@ -101,9 +102,10 @@ public class MarcTool {
} }
if ("marc2xml".equals(mode)) { if ("marc2xml".equals(mode)) {
try (InputStream in = Files.newInputStream(Paths.get(input)); try (InputStream in = Files.newInputStream(Paths.get(input));
BufferedInputStream bufferedInputStream = new BufferedInputStream(in, 65536);
MarcXchangeWriter writer = new MarcXchangeWriter(Files.newBufferedWriter(Paths.get(output)), true)) { MarcXchangeWriter writer = new MarcXchangeWriter(Files.newBufferedWriter(Paths.get(output)), true)) {
Marc.Builder builder = Marc.builder() Marc.Builder builder = Marc.builder()
.setInputStream(in) .setInputStream(bufferedInputStream)
.setCharset(Charset.forName(charset)) .setCharset(Charset.forName(charset))
.setMarcListener(writer); .setMarcListener(writer);
if (schema != null && stylesheet != null && result != null) { if (schema != null && stylesheet != null && result != null) {
@ -111,7 +113,7 @@ public class MarcTool {
builder.setSchema(schema).build().transform(new URL(stylesheet), builder.setSchema(schema).build().transform(new URL(stylesheet),
new StreamResult(Files.newBufferedWriter(Paths.get(result)))); new StreamResult(Files.newBufferedWriter(Paths.get(result))));
} else { } else {
builder.build().writeCollection(); builder.build().writeCollection(65536);
} }
} catch (Exception e) { } catch (Exception e) {
logger.log(Level.SEVERE, e.getMessage(), e); logger.log(Level.SEVERE, e.getMessage(), e);

View file

@ -9,6 +9,7 @@ import org.xbib.marc.MarcRecord;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
@ -26,7 +27,7 @@ public class HBZTest {
try (InputStream in = getClass().getResource(file).openStream()) { try (InputStream in = getClass().getResource(file).openStream()) {
Marc marc = Marc.builder() Marc marc = Marc.builder()
.setInputStream(in) .setInputStream(in)
.setCharset(Charset.forName("UTF-8")) .setCharset(StandardCharsets.UTF_8)
.build(); .build();
marc.iso2709Stream().chunks().forEach(chunk -> { marc.iso2709Stream().chunks().forEach(chunk -> {
count.incrementAndGet(); count.incrementAndGet();

View file

@ -323,7 +323,7 @@ public class MabTest {
assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + "-mapped.xml").openStream())); assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + "-mapped.xml").openStream()));
} }
private class LOWTransformer implements MarcValueTransformer { private static class LOWTransformer implements MarcValueTransformer {
@Override @Override
public String transform(String value) { public String transform(String value) {

View file

@ -10,6 +10,7 @@ import org.xbib.marc.label.RecordLabel;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
@ -29,7 +30,7 @@ public class OBVSGTest {
try (InputStream in = getClass().getResource(file).openStream()) { try (InputStream in = getClass().getResource(file).openStream()) {
Marc marc = Marc.builder() Marc marc = Marc.builder()
.setInputStream(in) .setInputStream(in)
.setCharset(Charset.forName("UTF-8")) .setCharset(StandardCharsets.UTF_8)
.build(); .build();
marc.iso2709Stream().chunks().forEach(chunk -> { marc.iso2709Stream().chunks().forEach(chunk -> {
count.incrementAndGet(); count.incrementAndGet();

View file

@ -83,7 +83,8 @@ public class BufferedSeparatorInputStreamTest {
}; };
String s = "sequential.groupstream"; String s = "sequential.groupstream";
InputStream in = getClass().getResource(s).openStream(); InputStream in = getClass().getResource(s).openStream();
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in); BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in, 8192);
Chunk<byte[], BytesReference> chunk; Chunk<byte[], BytesReference> chunk;
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) { while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
listener.chunk(chunk); listener.chunk(chunk);
@ -101,7 +102,8 @@ public class BufferedSeparatorInputStreamTest {
String s = "sequential.groupstream"; String s = "sequential.groupstream";
InputStream in = getClass().getResource(s).openStream(); InputStream in = getClass().getResource(s).openStream();
final AtomicInteger count = new AtomicInteger(0); final AtomicInteger count = new AtomicInteger(0);
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in); BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in, 8192);
ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> count.incrementAndGet(); ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> count.incrementAndGet();
Chunk<byte[], BytesReference> chunk; Chunk<byte[], BytesReference> chunk;
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) { while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
@ -117,7 +119,8 @@ public class BufferedSeparatorInputStreamTest {
Map<Integer, Integer> map2 = new LinkedHashMap<>(); Map<Integer, Integer> map2 = new LinkedHashMap<>();
InputStream in2 = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream(); InputStream in2 = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
final AtomicInteger count2 = new AtomicInteger(0); final AtomicInteger count2 = new AtomicInteger(0);
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in2); BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in2, 8192);
ChunkListener<byte[], BytesReference> chunkListener2 = ChunkListener<byte[], BytesReference> chunkListener2 =
(chunk2) -> map2.put(count2.incrementAndGet(), chunk2.data().length()); (chunk2) -> map2.put(count2.incrementAndGet(), chunk2.data().length());
Chunk<byte[], BytesReference> chunk2; Chunk<byte[], BytesReference> chunk2;
@ -151,7 +154,8 @@ public class BufferedSeparatorInputStreamTest {
public void testChunkCount() throws Exception { public void testChunkCount() throws Exception {
String s = "periouni.mrc"; String s = "periouni.mrc";
InputStream in = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream(); InputStream in = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in); BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in, 8192);
long l = bufferedSeparatorInputStream.chunks().count(); long l = bufferedSeparatorInputStream.chunks().count();
assertEquals(192247, l); assertEquals(192247, l);
} }
@ -162,7 +166,8 @@ public class BufferedSeparatorInputStreamTest {
Map<Integer, Integer> map = new LinkedHashMap<>(); Map<Integer, Integer> map = new LinkedHashMap<>();
InputStream in = getClass().getResource("/org/xbib/marc/" + s).openStream(); InputStream in = getClass().getResource("/org/xbib/marc/" + s).openStream();
final AtomicInteger count = new AtomicInteger(0); final AtomicInteger count = new AtomicInteger(0);
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in); BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(in, 8192);
ChunkListener<byte[], BytesReference> chunkListener = ChunkListener<byte[], BytesReference> chunkListener =
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length()); (chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
Chunk<byte[], BytesReference> chunk; Chunk<byte[], BytesReference> chunk;
@ -174,5 +179,4 @@ public class BufferedSeparatorInputStreamTest {
+ "31=2, 32=9, 33=9, 34=2, 35=6, 36=9, 37=0}", map.toString()); + "31=2, 32=9, 33=9, 34=2, 35=6, 36=9, 37=0}", map.toString());
in.close(); in.close();
} }
} }

View file

@ -38,7 +38,7 @@ public class PatternInputStreamTest {
byte[] b = "Hello\nWorld".getBytes(StandardCharsets.UTF_8); byte[] b = "Hello\nWorld".getBytes(StandardCharsets.UTF_8);
Map<Integer, Integer> map = new LinkedHashMap<>(); Map<Integer, Integer> map = new LinkedHashMap<>();
final AtomicInteger count = new AtomicInteger(0); final AtomicInteger count = new AtomicInteger(0);
PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b)); PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b), 1024);
ChunkListener<byte[], BytesReference> chunkListener = ChunkListener<byte[], BytesReference> chunkListener =
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length()); (chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
Chunk<byte[], BytesReference> chunk; Chunk<byte[], BytesReference> chunk;
@ -55,7 +55,7 @@ public class PatternInputStreamTest {
byte[] b = "Hello\r\nWorld".getBytes(StandardCharsets.UTF_8); byte[] b = "Hello\r\nWorld".getBytes(StandardCharsets.UTF_8);
Map<Integer, Integer> map = new LinkedHashMap<>(); Map<Integer, Integer> map = new LinkedHashMap<>();
final AtomicInteger count = new AtomicInteger(0); final AtomicInteger count = new AtomicInteger(0);
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b)); PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b), 1024);
ChunkListener<byte[], BytesReference> chunkListener = ChunkListener<byte[], BytesReference> chunkListener =
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length()); (chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
Chunk<byte[], BytesReference> chunk; Chunk<byte[], BytesReference> chunk;
@ -74,7 +74,8 @@ public class PatternInputStreamTest {
"Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8) "Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8)
}; };
for (byte[] b : bytes) { for (byte[] b : bytes) {
PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b)); PatternInputStream separatorStream =
PatternInputStream.lf(new ByteArrayInputStream(b), 8192);
long l = separatorStream.chunks().count(); long l = separatorStream.chunks().count();
separatorStream.close(); separatorStream.close();
assertEquals(2L, l); assertEquals(2L, l);
@ -89,7 +90,8 @@ public class PatternInputStreamTest {
} }
Map<Integer, Integer> map = new LinkedHashMap<>(); Map<Integer, Integer> map = new LinkedHashMap<>();
final AtomicInteger count = new AtomicInteger(0); final AtomicInteger count = new AtomicInteger(0);
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes())); PatternInputStream separatorStream =
PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192);
ChunkListener<byte[], BytesReference> chunkListener = ChunkListener<byte[], BytesReference> chunkListener =
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length()); (chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
Chunk<byte[], BytesReference> chunk; Chunk<byte[], BytesReference> chunk;
@ -108,7 +110,8 @@ public class PatternInputStreamTest {
output.write("Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8)); output.write("Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8));
} }
final AtomicInteger count = new AtomicInteger(0); final AtomicInteger count = new AtomicInteger(0);
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes())); PatternInputStream separatorStream =
PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192);
separatorStream.chunks().forEach(chunk -> { separatorStream.chunks().forEach(chunk -> {
count.incrementAndGet(); count.incrementAndGet();
assertEquals(5, chunk.data().length()); assertEquals(5, chunk.data().length());

View file

@ -0,0 +1,102 @@
package org.xbib.marc.io;
import org.junit.Ignore;
import org.junit.Test;
import org.marc4j.MarcPermissiveStreamReader;
import org.marc4j.MarcReader;
import org.marc4j.marc.Record;
import org.xbib.marc.Marc;
import org.xbib.marc.MarcRecord;
import org.xbib.marc.MarcRecordListener;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
@Ignore
public class UncompressLargeFileTest {
private static final Logger logger = Logger.getLogger(UncompressLargeFileTest.class.getName());
@Test
public void uncompress() throws IOException {
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
byte[] buffer = new byte[1024 * 1024];
int length;
while ((length = gzipInputStream.read(buffer)) != -1) {
// do nothing
}
gzipInputStream.close();
inputStream.close();
}
@Test
public void uncompressAndDecodeChunks() throws Exception {
logger.log(Level.INFO, "start decoding chunks");
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
final AtomicInteger counter = new AtomicInteger(0);
BufferedSeparatorInputStream bufferedSeparatorInputStream =
new BufferedSeparatorInputStream(gzipInputStream, 1024 * 1024);
ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> counter.incrementAndGet();
Chunk<byte[], BytesReference> chunk;
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
chunkListener.chunk(chunk);
}
gzipInputStream.close();
logger.log(Level.INFO, "stop decoding chunks, counter = " + counter.get());
}
@Test
public void uncompressAndDecodeMarcRecords() throws IOException {
logger.log(Level.INFO, "start decoding MARC");
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
final AtomicInteger counter = new AtomicInteger(0);
Marc.builder()
.setInputStream(gzipInputStream)
.setMarcRecordListener(new MarcRecordListener() {
@Override
public void beginCollection() {
}
@Override
public void record(MarcRecord marcRecord) {
counter.incrementAndGet();
}
@Override
public void endCollection() {
}
})
.setCharset(StandardCharsets.UTF_8)
.build()
.writeRecords(1024 * 1024);
gzipInputStream.close();
inputStream.close();
logger.log(Level.INFO, "stop deocding MARC, counter = " + counter.get());
}
@Test
public void uncompressAndDecodeWithMarc4j() throws Exception {
logger.log(Level.INFO, "start decoding MARC4J");
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
final AtomicInteger counter = new AtomicInteger(0);
MarcReader reader = new MarcPermissiveStreamReader(gzipInputStream, true, true);
while (reader.hasNext()) {
Record record = reader.next();
counter.incrementAndGet();
// do nothing
}
gzipInputStream.close();
inputStream.close();
logger.log(Level.INFO, "stop deocding MARC4J, counter = " + counter.get());
}
}