add buffer size parameter, marc4j comparison test
This commit is contained in:
parent c8b8955303, commit b123d27567
19 changed files with 272 additions and 64 deletions
@@ -27,6 +27,7 @@ dependencies {
     testImplementation("org.mockito:mockito-core:${project.property('mockito.version')}") {
         exclude group: 'org.hamcrest'
     }
+    testImplementation "org.marc4j:marc4j:${project.property('marc4j.version')}"
 }
 
 compileJava {
@@ -1,9 +1,9 @@
 group = org.xbib
 name = marc
-version = 2.1.0
+version = 2.2.0
 
 # main
-xbib-content.version = 2.0.4
+xbib-content.version = 2.0.5
 
 # runtime
 xbib-bibliographic-character-sets.version = 1.0.0
@@ -15,3 +15,4 @@ xalan.version = 2.7.2
 xmlunit-matchers.version = 2.6.3
 system-rules.version = 1.19.0
 mockito.version = 3.1.0
+marc4j.version = 2.9.1
@@ -87,6 +87,8 @@ public final class Marc {
 
     private static final byte[] CRLF = { '\r', '\n'};
 
+    private static final int DEFAULT_BUFFER_SIZE = 8192;
+
     private final Builder builder;
 
     private Marc(Builder builder) {
@@ -101,12 +103,17 @@ public final class Marc {
         return new Builder();
     }
 
+    public MarcIso2709Reader iso2709XmlReader() {
+        return iso2709XmlReader(DEFAULT_BUFFER_SIZE);
+    }
+
     /**
      * Return an XML reader on a ISO 2709 input stream.
+     * @param bufferSize buffer size for input stream
      * @return XML reader
      */
-    public MarcIso2709Reader iso2709XmlReader() {
-        return new MarcIso2709Reader(builder);
+    public MarcIso2709Reader iso2709XmlReader(int bufferSize) {
+        return new MarcIso2709Reader(builder, bufferSize);
     }
 
     /**
@@ -137,12 +144,17 @@ public final class Marc {
         xmlEventReader.close();
     }
 
+    public BufferedSeparatorInputStream iso2709Stream() {
+        return iso2709Stream(DEFAULT_BUFFER_SIZE);
+    }
+
     /**
      * Return ISO 2709 stream.
+     * @param bufferSize buffer size
      * @return ISO 2709 stream
      */
-    public BufferedSeparatorInputStream iso2709Stream() {
-        return builder.iso2709Stream();
+    public BufferedSeparatorInputStream iso2709Stream(int bufferSize) {
+        return builder.iso2709Stream(bufferSize);
     }
 
     /**
@@ -224,7 +236,8 @@ public final class Marc {
      * @throws IOException if parsing fails
      */
    public Document document() throws IOException {
-        return new Sax2Dom(iso2709XmlReader(), new InputSource(builder.getInputStream())).document();
+        return new Sax2Dom(iso2709XmlReader(DEFAULT_BUFFER_SIZE),
+                new InputSource(builder.getInputStream())).document();
     }
 
     /**
@@ -260,16 +273,25 @@ public final class Marc {
         }
     }
 
+    public void writeCollection() throws IOException {
+        writeCollection(DEFAULT_BUFFER_SIZE);
+    }
+
     /**
      * Write MARC bibliographic data from separator stream chunk by chunk to a MARC collection.
+     * @param bufferSize buffer size for the separator input stream
      * @throws IOException if writing fails
      */
-    public void writeCollection() throws IOException {
-        wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream()));
+    public void writeCollection(int bufferSize) throws IOException {
+        wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize));
     }
 
     public void writeCollection(String type) throws IOException {
-        wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream()));
+        writeCollection(type, DEFAULT_BUFFER_SIZE);
+    }
+
+    public void writeCollection(String type, int bufferSize) throws IOException {
+        wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize));
     }
 
     public int wrapIntoCollection(ChunkStream<byte[], BytesReference> stream) throws IOException {
@@ -331,21 +353,31 @@ public final class Marc {
         return count;
     }
 
+    public void writeRecordCollection() throws IOException {
+        writeRecordCollection(DEFAULT_BUFFER_SIZE);
+    }
+
     /**
      * Write MARC bibliographic events from a separator stream, record by record, wrapped into a
      * pair of {@code collection} elements.
+     * @param bufferSize buffer size
      * @throws IOException if writing fails
      */
-    public void writeRecordCollection() throws IOException {
-        wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), true);
+    public void writeRecordCollection(int bufferSize) throws IOException {
+        wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), true);
     }
 
+    public void writeRecords() throws IOException {
+        writeRecords(DEFAULT_BUFFER_SIZE);
+    }
+
     /**
      * Write MARC bibliographic events from a separator stream, record by record.
+     * @param bufferSize buffer size for separator input stream
      * @throws IOException if writing fails
      */
-    public void writeRecords() throws IOException {
-        wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), false);
+    public void writeRecords(int bufferSize) throws IOException {
+        wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), false);
     }
 
     /**
@@ -551,8 +583,11 @@ public final class Marc {
      */
     public static class MarcIso2709Reader extends MarcXmlReader {
 
-        private MarcIso2709Reader(Builder builder) {
+        private final int bufferSize;
+
+        private MarcIso2709Reader(Builder builder, int bufferSize) {
             super(builder);
+            this.bufferSize = bufferSize;
         }
 
         @Override
@@ -560,7 +595,8 @@ public final class Marc {
             if (input.getByteStream() == null) {
                 throw new IllegalArgumentException("no input stream found");
             }
-            try (BufferedSeparatorInputStream stream = new BufferedSeparatorInputStream(input.getByteStream())) {
+            try (BufferedSeparatorInputStream stream =
+                         new BufferedSeparatorInputStream(input.getByteStream(), bufferSize)) {
                 MarcGenerator marcGenerator = builder.createGenerator();
                 Chunk<byte[], BytesReference> chunk;
                 while ((chunk = stream.readChunk()) != null) {
@@ -895,10 +931,11 @@ public final class Marc {
 
         /**
          * Create an ISO 2709 stream.
+         * @param bufferSize buffer size
          * @return ISO 2709 stream
          */
-        public BufferedSeparatorInputStream iso2709Stream() {
-            return new BufferedSeparatorInputStream(inputStream);
+        public BufferedSeparatorInputStream iso2709Stream(int bufferSize) {
+            return new BufferedSeparatorInputStream(inputStream, bufferSize);
         }
 
         /**
@@ -1088,7 +1125,7 @@ public final class Marc {
          */
         public Iterator<MarcRecord> recordIterator() {
             if (stream == null) {
-                this.stream = new BufferedSeparatorInputStream(inputStream);
+                this.stream = new BufferedSeparatorInputStream(inputStream, DEFAULT_BUFFER_SIZE);
             }
             if (marcGenerator == null) {
                 this.marcGenerator = createGenerator();
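For orientation, a minimal sketch of how a caller might use the buffer-size overloads introduced above. The class name, file paths, and the 64 KB value are illustrative assumptions, not part of this commit; only the builder methods and writeCollection(int) shown in the diff are relied on.

    import org.xbib.marc.Marc;
    import org.xbib.marc.xml.MarcXchangeWriter;

    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    public class BufferSizeUsageExample {
        public static void main(String[] args) throws Exception {
            // "records.mrc" and "records.xml" are placeholder paths.
            try (InputStream in = Files.newInputStream(Paths.get("records.mrc"));
                 MarcXchangeWriter writer = new MarcXchangeWriter(
                         Files.newBufferedWriter(Paths.get("records.xml")), true)) {
                Marc.builder()
                        .setInputStream(in)
                        .setCharset(StandardCharsets.UTF_8)
                        .setMarcListener(writer)
                        .build()
                        .writeCollection(65536); // 64 KB instead of the 8 KB DEFAULT_BUFFER_SIZE
            }
        }
    }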
@@ -43,12 +43,24 @@ public class AlephSequentialInputStream extends PatternInputStream {
 
     private String alephSysNumber;
 
-    public AlephSequentialInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
-        super(in, pattern);
+    public AlephSequentialInputStream(InputStream in,
+                                      byte[] pattern,
+                                      MarcGenerator marcGenerator) {
+        this(in, pattern, marcGenerator, 8192);
+    }
+
+    public AlephSequentialInputStream(InputStream in,
+                                      byte[] pattern,
+                                      MarcGenerator marcGenerator,
+                                      int bufferSize) {
+        super(in, pattern, bufferSize);
         this.marcGenerator = marcGenerator;
         this.bytesStreamOutput = new BytesStreamOutput();
         // this format might come without a record label, create a default one
-        this.label = RecordLabel.builder().setIndicatorLength(2).setSubfieldIdentifierLength(1).build();
+        this.label = RecordLabel.builder()
+                .setIndicatorLength(2)
+                .setSubfieldIdentifierLength(1)
+                .build();
     }
 
     @Override
@@ -53,8 +53,17 @@ public class BiblioMondoInputStream extends PatternInputStream {
 
     private final BytesStreamOutput bytesStreamOutput;
 
-    public BiblioMondoInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
-        super(in, pattern);
+    public BiblioMondoInputStream(InputStream in,
+                                  byte[] pattern,
+                                  MarcGenerator marcGenerator) {
+        this(in, pattern, marcGenerator, 8192);
+    }
+
+    public BiblioMondoInputStream(InputStream in,
+                                  byte[] pattern,
+                                  MarcGenerator marcGenerator,
+                                  int bufferSize) {
+        super(in, pattern, bufferSize);
         this.marcGenerator = marcGenerator;
         this.bytesStreamOutput = new BytesStreamOutput();
     }
@@ -50,8 +50,19 @@ public class MabDisketteInputStream extends PatternInputStream {
         this(in, pattern, '\u0000', marcGenerator);
     }
 
-    public MabDisketteInputStream(InputStream in, byte[] pattern, char subfieldDelimiter, MarcGenerator marcGenerator) {
-        super(in, pattern);
+    public MabDisketteInputStream(InputStream in,
+                                  byte[] pattern,
+                                  char subfieldDelimiter,
+                                  MarcGenerator marcGenerator) {
+        this(in, pattern, subfieldDelimiter, marcGenerator, 8192);
+    }
+
+    public MabDisketteInputStream(InputStream in,
+                                  byte[] pattern,
+                                  char subfieldDelimiter,
+                                  MarcGenerator marcGenerator,
+                                  int bufferSize) {
+        super(in, pattern, bufferSize);
         this.marcGenerator = marcGenerator;
         this.subfieldDelimiter = subfieldDelimiter;
         this.bytesStreamOutput = new BytesStreamOutput();
@@ -50,8 +50,17 @@ public class PicaInputStream extends PatternInputStream {
 
     private final BytesStreamOutput bytesStreamOutput;
 
-    public PicaInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
-        super(in, pattern);
+    public PicaInputStream(InputStream in,
+                           byte[] pattern,
+                           MarcGenerator marcGenerator) {
+        this(in, pattern, marcGenerator, 8192);
+    }
+
+    public PicaInputStream(InputStream in,
+                           byte[] pattern,
+                           MarcGenerator marcGenerator,
+                           int bufferSize) {
+        super(in, pattern, bufferSize);
         this.marcGenerator = marcGenerator;
         this.bytesStreamOutput = new BytesStreamOutput();
     }
@@ -52,8 +52,17 @@ public class PicaPlainInputStream extends PatternInputStream {
 
     private boolean started;
 
-    public PicaPlainInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
-        super(in, pattern);
+    public PicaPlainInputStream(InputStream in,
+                                byte[] pattern,
+                                MarcGenerator marcGenerator) {
+        this(in, pattern, marcGenerator, 8192);
+    }
+
+    public PicaPlainInputStream(InputStream in,
+                                byte[] pattern,
+                                MarcGenerator marcGenerator,
+                                int bufferSize) {
+        super(in, pattern, bufferSize);
         this.marcGenerator = marcGenerator;
         this.bytesStreamOutput = new BytesStreamOutput();
         this.started = true;
@@ -72,14 +72,24 @@ public class SisisInputStream extends PatternInputStream {
 
     private boolean labelEmitted;
 
+    public SisisInputStream(InputStream in,
+                            byte[] pattern,
+                            MarcGenerator marcGenerator) {
+        this(in, pattern, marcGenerator, 8192);
+    }
+
     /**
      * Create a SISIS input stream.
      * @param in the underlying input stream
      * @param pattern the pattern for the separator
      * @param marcGenerator a MARC generator
+     * @param bufferSize buffer size
      */
-    public SisisInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
-        super(in, pattern);
+    public SisisInputStream(InputStream in,
+                            byte[] pattern,
+                            MarcGenerator marcGenerator,
+                            int bufferSize) {
+        super(in, pattern, bufferSize);
         this.marcGenerator = marcGenerator;
         this.bytesStreamOutput = new BytesStreamOutput();
         // this format comes without a record label, create a default one
@@ -45,14 +45,6 @@ abstract class BaseChunkStream extends BufferedInputStream implements ChunkStream
 
     int buffersize;
 
-    /**
-     * Create a base chunk stream.
-     * @param in the underlying input stream
-     */
-    BaseChunkStream(InputStream in) {
-        this(in, DEFAULT_BUFFER_SIZE);
-    }
-
     /**
      * Create a base chunk stream.
      * @param in the underlying input stream
@@ -70,9 +70,10 @@ public class BufferedSeparatorInputStream extends BaseChunkStream {
     /**
      * Create a buffered information separator stream.
      * @param in the underlying input stream
+     * @param bufferSize the buffer size
      */
-    public BufferedSeparatorInputStream(InputStream in) {
-        super(in);
+    public BufferedSeparatorInputStream(InputStream in, int bufferSize) {
+        super(in, bufferSize);
     }
 
     @Override
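A small sketch of constructing BufferedSeparatorInputStream with the now-required buffer size and draining it with readChunk(). The file name and class name are placeholders, the package imports are assumed from the org.xbib.marc.io package seen in the new test below, and 8192 simply mirrors the former default.

    import org.xbib.marc.io.BufferedSeparatorInputStream;
    import org.xbib.marc.io.BytesReference;
    import org.xbib.marc.io.Chunk;

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    public class ChunkCountExample {
        public static void main(String[] args) throws Exception {
            // Count ISO 2709 chunks in a placeholder file with an explicit buffer size.
            try (InputStream in = Files.newInputStream(Paths.get("records.mrc"));
                 BufferedSeparatorInputStream stream = new BufferedSeparatorInputStream(in, 8192)) {
                long chunks = 0;
                Chunk<byte[], BytesReference> chunk;
                while ((chunk = stream.readChunk()) != null) {
                    chunks++;
                }
                System.out.println("chunks = " + chunks);
            }
        }
    }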
@@ -39,9 +39,10 @@ public class PatternInputStream extends BaseChunkStream {
      * Create a pattern delimited input stream.
      * @param in the underlying input stream
      * @param pattern the pattern
+     * @param bufferSize buffer size
      */
-    public PatternInputStream(InputStream in, byte[] pattern) {
-        super(in);
+    public PatternInputStream(InputStream in, byte[] pattern, int bufferSize) {
+        super(in, bufferSize);
         requireNonNull(pattern);
         this.pattern = pattern.clone();
     }
@@ -49,19 +50,21 @@ public class PatternInputStream extends BaseChunkStream {
     /**
      * Convenience method to create a line-feed pattern separated input stream.
      * @param in the input stream to wrap
+     * @param bufferSize buffer size
      * @return the pattern input stream
      */
-    public static PatternInputStream lf(InputStream in) {
-        return new PatternInputStream(in, LF);
+    public static PatternInputStream lf(InputStream in, int bufferSize) {
+        return new PatternInputStream(in, LF, bufferSize);
     }
 
     /**
      * Convenience method to create a carriage-return/line-feed pattern separated input stream.
      * @param in the input stream to wrap
+     * @param bufferSize buffer size
      * @return the pattern input stream
      */
-    public static PatternInputStream crlf(InputStream in) {
-        return new PatternInputStream(in, CRLF);
+    public static PatternInputStream crlf(InputStream in, int bufferSize) {
+        return new PatternInputStream(in, CRLF, bufferSize);
     }
 
     /**
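Similarly, a sketch of the changed lf()/crlf() convenience factories, which now take the buffer size as a second argument. The 1024 value is arbitrary, the class name is hypothetical, and the package is assumed to be org.xbib.marc.io, as in the tests below.

    import org.xbib.marc.io.BytesReference;
    import org.xbib.marc.io.Chunk;
    import org.xbib.marc.io.PatternInputStream;

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;

    public class PatternStreamExample {
        public static void main(String[] args) throws Exception {
            byte[] data = "Hello\nWorld".getBytes(StandardCharsets.UTF_8);
            // lf(...) now takes the buffer size explicitly.
            try (PatternInputStream stream = PatternInputStream.lf(new ByteArrayInputStream(data), 1024)) {
                Chunk<byte[], BytesReference> chunk;
                while ((chunk = stream.readChunk()) != null) {
                    System.out.println(chunk.data().length()); // prints the length of each chunk
                }
            }
        }
    }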
@@ -19,6 +19,7 @@ package org.xbib.marc.tools;
 import org.xbib.marc.Marc;
 import org.xbib.marc.xml.MarcXchangeWriter;
 
+import java.io.BufferedInputStream;
 import java.io.InputStream;
 import java.net.URL;
 import java.nio.charset.Charset;
@@ -101,9 +102,10 @@ public class MarcTool {
         }
         if ("marc2xml".equals(mode)) {
             try (InputStream in = Files.newInputStream(Paths.get(input));
+                 BufferedInputStream bufferedInputStream = new BufferedInputStream(in, 65536);
                  MarcXchangeWriter writer = new MarcXchangeWriter(Files.newBufferedWriter(Paths.get(output)), true)) {
                 Marc.Builder builder = Marc.builder()
-                        .setInputStream(in)
+                        .setInputStream(bufferedInputStream)
                         .setCharset(Charset.forName(charset))
                         .setMarcListener(writer);
                 if (schema != null && stylesheet != null && result != null) {
@@ -111,7 +113,7 @@ public class MarcTool {
                     builder.setSchema(schema).build().transform(new URL(stylesheet),
                             new StreamResult(Files.newBufferedWriter(Paths.get(result))));
                 } else {
-                    builder.build().writeCollection();
+                    builder.build().writeCollection(65536);
                 }
             } catch (Exception e) {
                 logger.log(Level.SEVERE, e.getMessage(), e);
@@ -9,6 +9,7 @@ import org.xbib.marc.MarcRecord;
 
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.concurrent.atomic.AtomicInteger;
 
 /**
@@ -26,7 +27,7 @@ public class HBZTest {
         try (InputStream in = getClass().getResource(file).openStream()) {
             Marc marc = Marc.builder()
                     .setInputStream(in)
-                    .setCharset(Charset.forName("UTF-8"))
+                    .setCharset(StandardCharsets.UTF_8)
                     .build();
             marc.iso2709Stream().chunks().forEach(chunk -> {
                 count.incrementAndGet();
@@ -323,7 +323,7 @@ public class MabTest {
         assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + "-mapped.xml").openStream()));
     }
 
-    private class LOWTransformer implements MarcValueTransformer {
+    private static class LOWTransformer implements MarcValueTransformer {
 
         @Override
         public String transform(String value) {
@@ -10,6 +10,7 @@ import org.xbib.marc.label.RecordLabel;
 
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.concurrent.atomic.AtomicInteger;
 
 /**
@@ -29,7 +30,7 @@ public class OBVSGTest {
         try (InputStream in = getClass().getResource(file).openStream()) {
             Marc marc = Marc.builder()
                     .setInputStream(in)
-                    .setCharset(Charset.forName("UTF-8"))
+                    .setCharset(StandardCharsets.UTF_8)
                     .build();
             marc.iso2709Stream().chunks().forEach(chunk -> {
                 count.incrementAndGet();
@@ -83,7 +83,8 @@ public class BufferedSeparatorInputStreamTest {
         };
         String s = "sequential.groupstream";
         InputStream in = getClass().getResource(s).openStream();
-        BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
+        BufferedSeparatorInputStream bufferedSeparatorInputStream =
+                new BufferedSeparatorInputStream(in, 8192);
         Chunk<byte[], BytesReference> chunk;
         while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
             listener.chunk(chunk);
@@ -101,7 +102,8 @@ public class BufferedSeparatorInputStreamTest {
         String s = "sequential.groupstream";
         InputStream in = getClass().getResource(s).openStream();
         final AtomicInteger count = new AtomicInteger(0);
-        BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
+        BufferedSeparatorInputStream bufferedSeparatorInputStream =
+                new BufferedSeparatorInputStream(in, 8192);
         ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> count.incrementAndGet();
         Chunk<byte[], BytesReference> chunk;
         while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
@@ -117,7 +119,8 @@ public class BufferedSeparatorInputStreamTest {
         Map<Integer, Integer> map2 = new LinkedHashMap<>();
         InputStream in2 = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
         final AtomicInteger count2 = new AtomicInteger(0);
-        BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in2);
+        BufferedSeparatorInputStream bufferedSeparatorInputStream =
+                new BufferedSeparatorInputStream(in2, 8192);
         ChunkListener<byte[], BytesReference> chunkListener2 =
                 (chunk2) -> map2.put(count2.incrementAndGet(), chunk2.data().length());
         Chunk<byte[], BytesReference> chunk2;
@@ -151,7 +154,8 @@ public class BufferedSeparatorInputStreamTest {
     public void testChunkCount() throws Exception {
         String s = "periouni.mrc";
         InputStream in = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
-        BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
+        BufferedSeparatorInputStream bufferedSeparatorInputStream =
+                new BufferedSeparatorInputStream(in, 8192);
         long l = bufferedSeparatorInputStream.chunks().count();
         assertEquals(192247, l);
     }
@@ -162,7 +166,8 @@ public class BufferedSeparatorInputStreamTest {
         Map<Integer, Integer> map = new LinkedHashMap<>();
         InputStream in = getClass().getResource("/org/xbib/marc/" + s).openStream();
         final AtomicInteger count = new AtomicInteger(0);
-        BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
+        BufferedSeparatorInputStream bufferedSeparatorInputStream =
+                new BufferedSeparatorInputStream(in, 8192);
         ChunkListener<byte[], BytesReference> chunkListener =
                 (chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
         Chunk<byte[], BytesReference> chunk;
@@ -174,5 +179,4 @@ public class BufferedSeparatorInputStreamTest {
                 + "31=2, 32=9, 33=9, 34=2, 35=6, 36=9, 37=0}", map.toString());
         in.close();
     }
-
 }
@@ -38,7 +38,7 @@ public class PatternInputStreamTest {
         byte[] b = "Hello\nWorld".getBytes(StandardCharsets.UTF_8);
         Map<Integer, Integer> map = new LinkedHashMap<>();
         final AtomicInteger count = new AtomicInteger(0);
-        PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b));
+        PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b), 1024);
         ChunkListener<byte[], BytesReference> chunkListener =
                 (chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
         Chunk<byte[], BytesReference> chunk;
@@ -55,7 +55,7 @@ public class PatternInputStreamTest {
         byte[] b = "Hello\r\nWorld".getBytes(StandardCharsets.UTF_8);
         Map<Integer, Integer> map = new LinkedHashMap<>();
         final AtomicInteger count = new AtomicInteger(0);
-        PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b));
+        PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b), 1024);
         ChunkListener<byte[], BytesReference> chunkListener =
                 (chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
         Chunk<byte[], BytesReference> chunk;
@@ -74,7 +74,8 @@ public class PatternInputStreamTest {
                 "Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8)
         };
         for (byte[] b : bytes) {
-            PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b));
+            PatternInputStream separatorStream =
+                    PatternInputStream.lf(new ByteArrayInputStream(b), 8192);
             long l = separatorStream.chunks().count();
             separatorStream.close();
             assertEquals(2L, l);
@@ -89,7 +90,8 @@ public class PatternInputStreamTest {
         }
         Map<Integer, Integer> map = new LinkedHashMap<>();
         final AtomicInteger count = new AtomicInteger(0);
-        PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()));
+        PatternInputStream separatorStream =
+                PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192);
         ChunkListener<byte[], BytesReference> chunkListener =
                 (chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
         Chunk<byte[], BytesReference> chunk;
@@ -108,7 +110,8 @@ public class PatternInputStreamTest {
             output.write("Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8));
         }
         final AtomicInteger count = new AtomicInteger(0);
-        PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()));
+        PatternInputStream separatorStream =
+                PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192);
         separatorStream.chunks().forEach(chunk -> {
             count.incrementAndGet();
             assertEquals(5, chunk.data().length());
src/test/java/org/xbib/marc/io/UncompressLargeFileTest.java (new file, 102 lines)
@@ -0,0 +1,102 @@
package org.xbib.marc.io;

import org.junit.Ignore;
import org.junit.Test;
import org.marc4j.MarcPermissiveStreamReader;
import org.marc4j.MarcReader;
import org.marc4j.marc.Record;
import org.xbib.marc.Marc;
import org.xbib.marc.MarcRecord;
import org.xbib.marc.MarcRecordListener;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;

@Ignore
public class UncompressLargeFileTest {

    private static final Logger logger = Logger.getLogger(UncompressLargeFileTest.class.getName());

    @Test
    public void uncompress() throws IOException {
        InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
        GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
        byte[] buffer = new byte[1024 * 1024];
        int length;
        while ((length = gzipInputStream.read(buffer)) != -1) {
            // do nothing
        }
        gzipInputStream.close();
        inputStream.close();
    }

    @Test
    public void uncompressAndDecodeChunks() throws Exception {
        logger.log(Level.INFO, "start decoding chunks");
        InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
        GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
        final AtomicInteger counter = new AtomicInteger(0);
        BufferedSeparatorInputStream bufferedSeparatorInputStream =
                new BufferedSeparatorInputStream(gzipInputStream, 1024 * 1024);
        ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> counter.incrementAndGet();
        Chunk<byte[], BytesReference> chunk;
        while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
            chunkListener.chunk(chunk);
        }
        gzipInputStream.close();
        logger.log(Level.INFO, "stop decoding chunks, counter = " + counter.get());
    }

    @Test
    public void uncompressAndDecodeMarcRecords() throws IOException {
        logger.log(Level.INFO, "start decoding MARC");
        InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
        GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
        final AtomicInteger counter = new AtomicInteger(0);
        Marc.builder()
                .setInputStream(gzipInputStream)
                .setMarcRecordListener(new MarcRecordListener() {
                    @Override
                    public void beginCollection() {
                    }

                    @Override
                    public void record(MarcRecord marcRecord) {
                        counter.incrementAndGet();
                    }

                    @Override
                    public void endCollection() {
                    }
                })
                .setCharset(StandardCharsets.UTF_8)
                .build()
                .writeRecords(1024 * 1024);
        gzipInputStream.close();
        inputStream.close();
        logger.log(Level.INFO, "stop decoding MARC, counter = " + counter.get());
    }

    @Test
    public void uncompressAndDecodeWithMarc4j() throws Exception {
        logger.log(Level.INFO, "start decoding MARC4J");
        InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
        GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
        final AtomicInteger counter = new AtomicInteger(0);
        MarcReader reader = new MarcPermissiveStreamReader(gzipInputStream, true, true);
        while (reader.hasNext()) {
            Record record = reader.next();
            counter.incrementAndGet();
            // do nothing
        }
        gzipInputStream.close();
        inputStream.close();
        logger.log(Level.INFO, "stop decoding MARC4J, counter = " + counter.get());
    }
}