add buffer size parameter, marc4j comparison test
This commit is contained in:
parent
c8b8955303
commit
b123d27567
19 changed files with 272 additions and 64 deletions
|
@ -27,6 +27,7 @@ dependencies {
|
||||||
testImplementation("org.mockito:mockito-core:${project.property('mockito.version')}") {
|
testImplementation("org.mockito:mockito-core:${project.property('mockito.version')}") {
|
||||||
exclude group: 'org.hamcrest'
|
exclude group: 'org.hamcrest'
|
||||||
}
|
}
|
||||||
|
testImplementation "org.marc4j:marc4j:${project.property('marc4j.version')}"
|
||||||
}
|
}
|
||||||
|
|
||||||
compileJava {
|
compileJava {
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
group = org.xbib
|
group = org.xbib
|
||||||
name = marc
|
name = marc
|
||||||
version = 2.1.0
|
version = 2.2.0
|
||||||
|
|
||||||
# main
|
# main
|
||||||
xbib-content.version = 2.0.4
|
xbib-content.version = 2.0.5
|
||||||
|
|
||||||
# runtime
|
# runtime
|
||||||
xbib-bibliographic-character-sets.version = 1.0.0
|
xbib-bibliographic-character-sets.version = 1.0.0
|
||||||
|
@ -15,3 +15,4 @@ xalan.version = 2.7.2
|
||||||
xmlunit-matchers.version = 2.6.3
|
xmlunit-matchers.version = 2.6.3
|
||||||
system-rules.version = 1.19.0
|
system-rules.version = 1.19.0
|
||||||
mockito.version = 3.1.0
|
mockito.version = 3.1.0
|
||||||
|
marc4j.version = 2.9.1
|
||||||
|
|
|
@ -87,6 +87,8 @@ public final class Marc {
|
||||||
|
|
||||||
private static final byte[] CRLF = { '\r', '\n'};
|
private static final byte[] CRLF = { '\r', '\n'};
|
||||||
|
|
||||||
|
private static final int DEFAULT_BUFFER_SIZE = 8192;
|
||||||
|
|
||||||
private final Builder builder;
|
private final Builder builder;
|
||||||
|
|
||||||
private Marc(Builder builder) {
|
private Marc(Builder builder) {
|
||||||
|
@ -101,12 +103,17 @@ public final class Marc {
|
||||||
return new Builder();
|
return new Builder();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public MarcIso2709Reader iso2709XmlReader() {
|
||||||
|
return iso2709XmlReader(DEFAULT_BUFFER_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return an XML reader on a ISO 2709 input stream.
|
* Return an XML reader on a ISO 2709 input stream.
|
||||||
|
* @param bufferSize buffer size for input stream
|
||||||
* @return XML reader
|
* @return XML reader
|
||||||
*/
|
*/
|
||||||
public MarcIso2709Reader iso2709XmlReader() {
|
public MarcIso2709Reader iso2709XmlReader(int bufferSize) {
|
||||||
return new MarcIso2709Reader(builder);
|
return new MarcIso2709Reader(builder, bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -137,12 +144,17 @@ public final class Marc {
|
||||||
xmlEventReader.close();
|
xmlEventReader.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public BufferedSeparatorInputStream iso2709Stream() {
|
||||||
|
return iso2709Stream(DEFAULT_BUFFER_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return ISO 2709 stream.
|
* Return ISO 2709 stream.
|
||||||
|
* @param bufferSize buffer size
|
||||||
* @return ISO 2709 stream
|
* @return ISO 2709 stream
|
||||||
*/
|
*/
|
||||||
public BufferedSeparatorInputStream iso2709Stream() {
|
public BufferedSeparatorInputStream iso2709Stream(int bufferSize) {
|
||||||
return builder.iso2709Stream();
|
return builder.iso2709Stream(bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -224,7 +236,8 @@ public final class Marc {
|
||||||
* @throws IOException if parsing fails
|
* @throws IOException if parsing fails
|
||||||
*/
|
*/
|
||||||
public Document document() throws IOException {
|
public Document document() throws IOException {
|
||||||
return new Sax2Dom(iso2709XmlReader(), new InputSource(builder.getInputStream())).document();
|
return new Sax2Dom(iso2709XmlReader(DEFAULT_BUFFER_SIZE),
|
||||||
|
new InputSource(builder.getInputStream())).document();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -260,16 +273,25 @@ public final class Marc {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void writeCollection() throws IOException {
|
||||||
|
writeCollection(DEFAULT_BUFFER_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write MARC bibliographic data from seperator stream chunk by chunk to a MARC collection.
|
* Write MARC bibliographic data from seperator stream chunk by chunk to a MARC collection.
|
||||||
|
* @param bufferSize buffer size for the separator input stream
|
||||||
* @throws IOException if writing fails
|
* @throws IOException if writing fails
|
||||||
*/
|
*/
|
||||||
public void writeCollection() throws IOException {
|
public void writeCollection(int bufferSize) throws IOException {
|
||||||
wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream()));
|
wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void writeCollection(String type) throws IOException {
|
public void writeCollection(String type) throws IOException {
|
||||||
wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream()));
|
writeCollection(type, DEFAULT_BUFFER_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeCollection(String type, int bufferSize) throws IOException {
|
||||||
|
wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize));
|
||||||
}
|
}
|
||||||
|
|
||||||
public int wrapIntoCollection(ChunkStream<byte[], BytesReference> stream) throws IOException {
|
public int wrapIntoCollection(ChunkStream<byte[], BytesReference> stream) throws IOException {
|
||||||
|
@ -331,21 +353,31 @@ public final class Marc {
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void writeRecordCollection() throws IOException {
|
||||||
|
writeRecordCollection(DEFAULT_BUFFER_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write MARC bibliographic events from a separator strem, record by record, wrapped into a
|
* Write MARC bibliographic events from a separator strem, record by record, wrapped into a
|
||||||
* pair of {@code collection} elements.
|
* pair of {@code collection} elements.
|
||||||
|
* @param bufferSize buffer size
|
||||||
* @throws IOException if writing fails
|
* @throws IOException if writing fails
|
||||||
*/
|
*/
|
||||||
public void writeRecordCollection() throws IOException {
|
public void writeRecordCollection(int bufferSize) throws IOException {
|
||||||
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), true);
|
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeRecords() throws IOException {
|
||||||
|
writeRecords(DEFAULT_BUFFER_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write MARC bibliographic events from a separator strem, record by record.
|
* Write MARC bibliographic events from a separator strem, record by record.
|
||||||
|
* @param bufferSize buffer size for separator input stream
|
||||||
* @throws IOException if writing fails
|
* @throws IOException if writing fails
|
||||||
*/
|
*/
|
||||||
public void writeRecords() throws IOException {
|
public void writeRecords(int bufferSize) throws IOException {
|
||||||
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), false);
|
wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -551,8 +583,11 @@ public final class Marc {
|
||||||
*/
|
*/
|
||||||
public static class MarcIso2709Reader extends MarcXmlReader {
|
public static class MarcIso2709Reader extends MarcXmlReader {
|
||||||
|
|
||||||
private MarcIso2709Reader(Builder builder) {
|
private final int bufferSize;
|
||||||
|
|
||||||
|
private MarcIso2709Reader(Builder builder, int bufferSize) {
|
||||||
super(builder);
|
super(builder);
|
||||||
|
this.bufferSize = bufferSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -560,7 +595,8 @@ public final class Marc {
|
||||||
if (input.getByteStream() == null) {
|
if (input.getByteStream() == null) {
|
||||||
throw new IllegalArgumentException("no input stream found");
|
throw new IllegalArgumentException("no input stream found");
|
||||||
}
|
}
|
||||||
try (BufferedSeparatorInputStream stream = new BufferedSeparatorInputStream(input.getByteStream())) {
|
try (BufferedSeparatorInputStream stream =
|
||||||
|
new BufferedSeparatorInputStream(input.getByteStream(), bufferSize)) {
|
||||||
MarcGenerator marcGenerator = builder.createGenerator();
|
MarcGenerator marcGenerator = builder.createGenerator();
|
||||||
Chunk<byte[], BytesReference> chunk;
|
Chunk<byte[], BytesReference> chunk;
|
||||||
while ((chunk = stream.readChunk()) != null) {
|
while ((chunk = stream.readChunk()) != null) {
|
||||||
|
@ -895,10 +931,11 @@ public final class Marc {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an ISO 2709 stream.
|
* Create an ISO 2709 stream.
|
||||||
|
* @param bufferSize buffer size
|
||||||
* @return ISO 2709 stream
|
* @return ISO 2709 stream
|
||||||
*/
|
*/
|
||||||
public BufferedSeparatorInputStream iso2709Stream() {
|
public BufferedSeparatorInputStream iso2709Stream(int bufferSize) {
|
||||||
return new BufferedSeparatorInputStream(inputStream);
|
return new BufferedSeparatorInputStream(inputStream, bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1088,7 +1125,7 @@ public final class Marc {
|
||||||
*/
|
*/
|
||||||
public Iterator<MarcRecord> recordIterator() {
|
public Iterator<MarcRecord> recordIterator() {
|
||||||
if (stream == null) {
|
if (stream == null) {
|
||||||
this.stream = new BufferedSeparatorInputStream(inputStream);
|
this.stream = new BufferedSeparatorInputStream(inputStream, DEFAULT_BUFFER_SIZE);
|
||||||
}
|
}
|
||||||
if (marcGenerator == null) {
|
if (marcGenerator == null) {
|
||||||
this.marcGenerator = createGenerator();
|
this.marcGenerator = createGenerator();
|
||||||
|
|
|
@ -43,12 +43,24 @@ public class AlephSequentialInputStream extends PatternInputStream {
|
||||||
|
|
||||||
private String alephSysNumber;
|
private String alephSysNumber;
|
||||||
|
|
||||||
public AlephSequentialInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
|
public AlephSequentialInputStream(InputStream in,
|
||||||
super(in, pattern);
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator) {
|
||||||
|
this(in, pattern, marcGenerator, 8192);
|
||||||
|
}
|
||||||
|
|
||||||
|
public AlephSequentialInputStream(InputStream in,
|
||||||
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator,
|
||||||
|
int bufferSize) {
|
||||||
|
super(in, pattern, bufferSize);
|
||||||
this.marcGenerator = marcGenerator;
|
this.marcGenerator = marcGenerator;
|
||||||
this.bytesStreamOutput = new BytesStreamOutput();
|
this.bytesStreamOutput = new BytesStreamOutput();
|
||||||
// this format might come without a record label, create a default one
|
// this format might come without a record label, create a default one
|
||||||
this.label = RecordLabel.builder().setIndicatorLength(2).setSubfieldIdentifierLength(1).build();
|
this.label = RecordLabel.builder()
|
||||||
|
.setIndicatorLength(2)
|
||||||
|
.setSubfieldIdentifierLength(1)
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -53,8 +53,17 @@ public class BiblioMondoInputStream extends PatternInputStream {
|
||||||
|
|
||||||
private final BytesStreamOutput bytesStreamOutput;
|
private final BytesStreamOutput bytesStreamOutput;
|
||||||
|
|
||||||
public BiblioMondoInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
|
public BiblioMondoInputStream(InputStream in,
|
||||||
super(in, pattern);
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator) {
|
||||||
|
this(in, pattern, marcGenerator, 8192);
|
||||||
|
}
|
||||||
|
|
||||||
|
public BiblioMondoInputStream(InputStream in,
|
||||||
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator,
|
||||||
|
int bufferSize) {
|
||||||
|
super(in, pattern, bufferSize);
|
||||||
this.marcGenerator = marcGenerator;
|
this.marcGenerator = marcGenerator;
|
||||||
this.bytesStreamOutput = new BytesStreamOutput();
|
this.bytesStreamOutput = new BytesStreamOutput();
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,8 +50,19 @@ public class MabDisketteInputStream extends PatternInputStream {
|
||||||
this(in, pattern, '\u0000', marcGenerator);
|
this(in, pattern, '\u0000', marcGenerator);
|
||||||
}
|
}
|
||||||
|
|
||||||
public MabDisketteInputStream(InputStream in, byte[] pattern, char subfieldDelimiter, MarcGenerator marcGenerator) {
|
public MabDisketteInputStream(InputStream in,
|
||||||
super(in, pattern);
|
byte[] pattern,
|
||||||
|
char subfieldDelimiter,
|
||||||
|
MarcGenerator marcGenerator) {
|
||||||
|
this(in, pattern, subfieldDelimiter, marcGenerator, 8192);
|
||||||
|
}
|
||||||
|
|
||||||
|
public MabDisketteInputStream(InputStream in,
|
||||||
|
byte[] pattern,
|
||||||
|
char subfieldDelimiter,
|
||||||
|
MarcGenerator marcGenerator,
|
||||||
|
int bufferSize) {
|
||||||
|
super(in, pattern, bufferSize);
|
||||||
this.marcGenerator = marcGenerator;
|
this.marcGenerator = marcGenerator;
|
||||||
this.subfieldDelimiter = subfieldDelimiter;
|
this.subfieldDelimiter = subfieldDelimiter;
|
||||||
this.bytesStreamOutput = new BytesStreamOutput();
|
this.bytesStreamOutput = new BytesStreamOutput();
|
||||||
|
|
|
@ -50,8 +50,17 @@ public class PicaInputStream extends PatternInputStream {
|
||||||
|
|
||||||
private final BytesStreamOutput bytesStreamOutput;
|
private final BytesStreamOutput bytesStreamOutput;
|
||||||
|
|
||||||
public PicaInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
|
public PicaInputStream(InputStream in,
|
||||||
super(in, pattern);
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator) {
|
||||||
|
this(in, pattern, marcGenerator, 8192);
|
||||||
|
}
|
||||||
|
|
||||||
|
public PicaInputStream(InputStream in,
|
||||||
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator,
|
||||||
|
int bufferSize) {
|
||||||
|
super(in, pattern, bufferSize);
|
||||||
this.marcGenerator = marcGenerator;
|
this.marcGenerator = marcGenerator;
|
||||||
this.bytesStreamOutput = new BytesStreamOutput();
|
this.bytesStreamOutput = new BytesStreamOutput();
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,8 +52,17 @@ public class PicaPlainInputStream extends PatternInputStream {
|
||||||
|
|
||||||
private boolean started;
|
private boolean started;
|
||||||
|
|
||||||
public PicaPlainInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
|
public PicaPlainInputStream(InputStream in,
|
||||||
super(in, pattern);
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator) {
|
||||||
|
this(in, pattern, marcGenerator, 8192);
|
||||||
|
}
|
||||||
|
|
||||||
|
public PicaPlainInputStream(InputStream in,
|
||||||
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator,
|
||||||
|
int bufferSize) {
|
||||||
|
super(in, pattern, bufferSize);
|
||||||
this.marcGenerator = marcGenerator;
|
this.marcGenerator = marcGenerator;
|
||||||
this.bytesStreamOutput = new BytesStreamOutput();
|
this.bytesStreamOutput = new BytesStreamOutput();
|
||||||
this.started = true;
|
this.started = true;
|
||||||
|
|
|
@ -72,14 +72,24 @@ public class SisisInputStream extends PatternInputStream {
|
||||||
|
|
||||||
private boolean labelEmitted;
|
private boolean labelEmitted;
|
||||||
|
|
||||||
|
public SisisInputStream(InputStream in,
|
||||||
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator) {
|
||||||
|
this(in, pattern, marcGenerator, 8192);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a SISIS input stream.
|
* Create a SISIS input stream.
|
||||||
* @param in the underlying input stream
|
* @param in the underlying input stream
|
||||||
* @param pattern the pattern for the separator
|
* @param pattern the pattern for the separator
|
||||||
* @param marcGenerator a MARC generator
|
* @param marcGenerator a MARC generator
|
||||||
|
* @param bufferSize buffer size
|
||||||
*/
|
*/
|
||||||
public SisisInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) {
|
public SisisInputStream(InputStream in,
|
||||||
super(in, pattern);
|
byte[] pattern,
|
||||||
|
MarcGenerator marcGenerator,
|
||||||
|
int bufferSize) {
|
||||||
|
super(in, pattern, bufferSize);
|
||||||
this.marcGenerator = marcGenerator;
|
this.marcGenerator = marcGenerator;
|
||||||
this.bytesStreamOutput = new BytesStreamOutput();
|
this.bytesStreamOutput = new BytesStreamOutput();
|
||||||
// this format comes without a record label, create a default one
|
// this format comes without a record label, create a default one
|
||||||
|
|
|
@ -45,14 +45,6 @@ abstract class BaseChunkStream extends BufferedInputStream implements ChunkStrea
|
||||||
|
|
||||||
int buffersize;
|
int buffersize;
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a base chunk stream.
|
|
||||||
* @param in the underlying input stream
|
|
||||||
*/
|
|
||||||
BaseChunkStream(InputStream in) {
|
|
||||||
this(in, DEFAULT_BUFFER_SIZE);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a base chunk stream.
|
* Create a base chunk stream.
|
||||||
* @param in the underlying input stream
|
* @param in the underlying input stream
|
||||||
|
|
|
@ -70,9 +70,10 @@ public class BufferedSeparatorInputStream extends BaseChunkStream {
|
||||||
/**
|
/**
|
||||||
* Create a buffered information separator stream.
|
* Create a buffered information separator stream.
|
||||||
* @param in the underlying input stream
|
* @param in the underlying input stream
|
||||||
|
* @param bufferSize the buffer size
|
||||||
*/
|
*/
|
||||||
public BufferedSeparatorInputStream(InputStream in) {
|
public BufferedSeparatorInputStream(InputStream in, int bufferSize) {
|
||||||
super(in);
|
super(in, bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -39,9 +39,10 @@ public class PatternInputStream extends BaseChunkStream {
|
||||||
* Create a pattern delimited input stream.
|
* Create a pattern delimited input stream.
|
||||||
* @param in the underlying input stream
|
* @param in the underlying input stream
|
||||||
* @param pattern the pattern
|
* @param pattern the pattern
|
||||||
|
* @param bufferSize buffer size
|
||||||
*/
|
*/
|
||||||
public PatternInputStream(InputStream in, byte[] pattern) {
|
public PatternInputStream(InputStream in, byte[] pattern, int bufferSize) {
|
||||||
super(in);
|
super(in, bufferSize);
|
||||||
requireNonNull(pattern);
|
requireNonNull(pattern);
|
||||||
this.pattern = pattern.clone();
|
this.pattern = pattern.clone();
|
||||||
}
|
}
|
||||||
|
@ -49,19 +50,21 @@ public class PatternInputStream extends BaseChunkStream {
|
||||||
/**
|
/**
|
||||||
* Convenience method to cerate a line-feed pattern separated input stream.
|
* Convenience method to cerate a line-feed pattern separated input stream.
|
||||||
* @param in the input stream to wrap
|
* @param in the input stream to wrap
|
||||||
|
* @param bufferSize buffer size
|
||||||
* @return the pattern input stream
|
* @return the pattern input stream
|
||||||
*/
|
*/
|
||||||
public static PatternInputStream lf(InputStream in) {
|
public static PatternInputStream lf(InputStream in, int bufferSize) {
|
||||||
return new PatternInputStream(in, LF);
|
return new PatternInputStream(in, LF, bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convenience method to cerate a carriage-return/line-feed pattern separated input stream.
|
* Convenience method to cerate a carriage-return/line-feed pattern separated input stream.
|
||||||
* @param in the input stream to wrap
|
* @param in the input stream to wrap
|
||||||
|
* @param bufferSize buffer size
|
||||||
* @return the pattern input stream
|
* @return the pattern input stream
|
||||||
*/
|
*/
|
||||||
public static PatternInputStream crlf(InputStream in) {
|
public static PatternInputStream crlf(InputStream in, int bufferSize) {
|
||||||
return new PatternInputStream(in, CRLF);
|
return new PatternInputStream(in, CRLF, bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.xbib.marc.tools;
|
||||||
import org.xbib.marc.Marc;
|
import org.xbib.marc.Marc;
|
||||||
import org.xbib.marc.xml.MarcXchangeWriter;
|
import org.xbib.marc.xml.MarcXchangeWriter;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
@ -101,9 +102,10 @@ public class MarcTool {
|
||||||
}
|
}
|
||||||
if ("marc2xml".equals(mode)) {
|
if ("marc2xml".equals(mode)) {
|
||||||
try (InputStream in = Files.newInputStream(Paths.get(input));
|
try (InputStream in = Files.newInputStream(Paths.get(input));
|
||||||
|
BufferedInputStream bufferedInputStream = new BufferedInputStream(in, 65536);
|
||||||
MarcXchangeWriter writer = new MarcXchangeWriter(Files.newBufferedWriter(Paths.get(output)), true)) {
|
MarcXchangeWriter writer = new MarcXchangeWriter(Files.newBufferedWriter(Paths.get(output)), true)) {
|
||||||
Marc.Builder builder = Marc.builder()
|
Marc.Builder builder = Marc.builder()
|
||||||
.setInputStream(in)
|
.setInputStream(bufferedInputStream)
|
||||||
.setCharset(Charset.forName(charset))
|
.setCharset(Charset.forName(charset))
|
||||||
.setMarcListener(writer);
|
.setMarcListener(writer);
|
||||||
if (schema != null && stylesheet != null && result != null) {
|
if (schema != null && stylesheet != null && result != null) {
|
||||||
|
@ -111,7 +113,7 @@ public class MarcTool {
|
||||||
builder.setSchema(schema).build().transform(new URL(stylesheet),
|
builder.setSchema(schema).build().transform(new URL(stylesheet),
|
||||||
new StreamResult(Files.newBufferedWriter(Paths.get(result))));
|
new StreamResult(Files.newBufferedWriter(Paths.get(result))));
|
||||||
} else {
|
} else {
|
||||||
builder.build().writeCollection();
|
builder.build().writeCollection(65536);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.log(Level.SEVERE, e.getMessage(), e);
|
logger.log(Level.SEVERE, e.getMessage(), e);
|
||||||
|
|
|
@ -9,6 +9,7 @@ import org.xbib.marc.MarcRecord;
|
||||||
|
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -26,7 +27,7 @@ public class HBZTest {
|
||||||
try (InputStream in = getClass().getResource(file).openStream()) {
|
try (InputStream in = getClass().getResource(file).openStream()) {
|
||||||
Marc marc = Marc.builder()
|
Marc marc = Marc.builder()
|
||||||
.setInputStream(in)
|
.setInputStream(in)
|
||||||
.setCharset(Charset.forName("UTF-8"))
|
.setCharset(StandardCharsets.UTF_8)
|
||||||
.build();
|
.build();
|
||||||
marc.iso2709Stream().chunks().forEach(chunk -> {
|
marc.iso2709Stream().chunks().forEach(chunk -> {
|
||||||
count.incrementAndGet();
|
count.incrementAndGet();
|
||||||
|
|
|
@ -323,7 +323,7 @@ public class MabTest {
|
||||||
assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + "-mapped.xml").openStream()));
|
assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + "-mapped.xml").openStream()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private class LOWTransformer implements MarcValueTransformer {
|
private static class LOWTransformer implements MarcValueTransformer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String transform(String value) {
|
public String transform(String value) {
|
||||||
|
|
|
@ -10,6 +10,7 @@ import org.xbib.marc.label.RecordLabel;
|
||||||
|
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -29,7 +30,7 @@ public class OBVSGTest {
|
||||||
try (InputStream in = getClass().getResource(file).openStream()) {
|
try (InputStream in = getClass().getResource(file).openStream()) {
|
||||||
Marc marc = Marc.builder()
|
Marc marc = Marc.builder()
|
||||||
.setInputStream(in)
|
.setInputStream(in)
|
||||||
.setCharset(Charset.forName("UTF-8"))
|
.setCharset(StandardCharsets.UTF_8)
|
||||||
.build();
|
.build();
|
||||||
marc.iso2709Stream().chunks().forEach(chunk -> {
|
marc.iso2709Stream().chunks().forEach(chunk -> {
|
||||||
count.incrementAndGet();
|
count.incrementAndGet();
|
||||||
|
|
|
@ -83,7 +83,8 @@ public class BufferedSeparatorInputStreamTest {
|
||||||
};
|
};
|
||||||
String s = "sequential.groupstream";
|
String s = "sequential.groupstream";
|
||||||
InputStream in = getClass().getResource(s).openStream();
|
InputStream in = getClass().getResource(s).openStream();
|
||||||
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
|
BufferedSeparatorInputStream bufferedSeparatorInputStream =
|
||||||
|
new BufferedSeparatorInputStream(in, 8192);
|
||||||
Chunk<byte[], BytesReference> chunk;
|
Chunk<byte[], BytesReference> chunk;
|
||||||
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
|
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
|
||||||
listener.chunk(chunk);
|
listener.chunk(chunk);
|
||||||
|
@ -101,7 +102,8 @@ public class BufferedSeparatorInputStreamTest {
|
||||||
String s = "sequential.groupstream";
|
String s = "sequential.groupstream";
|
||||||
InputStream in = getClass().getResource(s).openStream();
|
InputStream in = getClass().getResource(s).openStream();
|
||||||
final AtomicInteger count = new AtomicInteger(0);
|
final AtomicInteger count = new AtomicInteger(0);
|
||||||
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
|
BufferedSeparatorInputStream bufferedSeparatorInputStream =
|
||||||
|
new BufferedSeparatorInputStream(in, 8192);
|
||||||
ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> count.incrementAndGet();
|
ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> count.incrementAndGet();
|
||||||
Chunk<byte[], BytesReference> chunk;
|
Chunk<byte[], BytesReference> chunk;
|
||||||
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
|
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
|
||||||
|
@ -117,7 +119,8 @@ public class BufferedSeparatorInputStreamTest {
|
||||||
Map<Integer, Integer> map2 = new LinkedHashMap<>();
|
Map<Integer, Integer> map2 = new LinkedHashMap<>();
|
||||||
InputStream in2 = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
|
InputStream in2 = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
|
||||||
final AtomicInteger count2 = new AtomicInteger(0);
|
final AtomicInteger count2 = new AtomicInteger(0);
|
||||||
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in2);
|
BufferedSeparatorInputStream bufferedSeparatorInputStream =
|
||||||
|
new BufferedSeparatorInputStream(in2, 8192);
|
||||||
ChunkListener<byte[], BytesReference> chunkListener2 =
|
ChunkListener<byte[], BytesReference> chunkListener2 =
|
||||||
(chunk2) -> map2.put(count2.incrementAndGet(), chunk2.data().length());
|
(chunk2) -> map2.put(count2.incrementAndGet(), chunk2.data().length());
|
||||||
Chunk<byte[], BytesReference> chunk2;
|
Chunk<byte[], BytesReference> chunk2;
|
||||||
|
@ -151,7 +154,8 @@ public class BufferedSeparatorInputStreamTest {
|
||||||
public void testChunkCount() throws Exception {
|
public void testChunkCount() throws Exception {
|
||||||
String s = "periouni.mrc";
|
String s = "periouni.mrc";
|
||||||
InputStream in = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
|
InputStream in = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream();
|
||||||
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
|
BufferedSeparatorInputStream bufferedSeparatorInputStream =
|
||||||
|
new BufferedSeparatorInputStream(in, 8192);
|
||||||
long l = bufferedSeparatorInputStream.chunks().count();
|
long l = bufferedSeparatorInputStream.chunks().count();
|
||||||
assertEquals(192247, l);
|
assertEquals(192247, l);
|
||||||
}
|
}
|
||||||
|
@ -162,7 +166,8 @@ public class BufferedSeparatorInputStreamTest {
|
||||||
Map<Integer, Integer> map = new LinkedHashMap<>();
|
Map<Integer, Integer> map = new LinkedHashMap<>();
|
||||||
InputStream in = getClass().getResource("/org/xbib/marc/" + s).openStream();
|
InputStream in = getClass().getResource("/org/xbib/marc/" + s).openStream();
|
||||||
final AtomicInteger count = new AtomicInteger(0);
|
final AtomicInteger count = new AtomicInteger(0);
|
||||||
BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in);
|
BufferedSeparatorInputStream bufferedSeparatorInputStream =
|
||||||
|
new BufferedSeparatorInputStream(in, 8192);
|
||||||
ChunkListener<byte[], BytesReference> chunkListener =
|
ChunkListener<byte[], BytesReference> chunkListener =
|
||||||
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
|
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
|
||||||
Chunk<byte[], BytesReference> chunk;
|
Chunk<byte[], BytesReference> chunk;
|
||||||
|
@ -174,5 +179,4 @@ public class BufferedSeparatorInputStreamTest {
|
||||||
+ "31=2, 32=9, 33=9, 34=2, 35=6, 36=9, 37=0}", map.toString());
|
+ "31=2, 32=9, 33=9, 34=2, 35=6, 36=9, 37=0}", map.toString());
|
||||||
in.close();
|
in.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ public class PatternInputStreamTest {
|
||||||
byte[] b = "Hello\nWorld".getBytes(StandardCharsets.UTF_8);
|
byte[] b = "Hello\nWorld".getBytes(StandardCharsets.UTF_8);
|
||||||
Map<Integer, Integer> map = new LinkedHashMap<>();
|
Map<Integer, Integer> map = new LinkedHashMap<>();
|
||||||
final AtomicInteger count = new AtomicInteger(0);
|
final AtomicInteger count = new AtomicInteger(0);
|
||||||
PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b));
|
PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b), 1024);
|
||||||
ChunkListener<byte[], BytesReference> chunkListener =
|
ChunkListener<byte[], BytesReference> chunkListener =
|
||||||
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
|
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
|
||||||
Chunk<byte[], BytesReference> chunk;
|
Chunk<byte[], BytesReference> chunk;
|
||||||
|
@ -55,7 +55,7 @@ public class PatternInputStreamTest {
|
||||||
byte[] b = "Hello\r\nWorld".getBytes(StandardCharsets.UTF_8);
|
byte[] b = "Hello\r\nWorld".getBytes(StandardCharsets.UTF_8);
|
||||||
Map<Integer, Integer> map = new LinkedHashMap<>();
|
Map<Integer, Integer> map = new LinkedHashMap<>();
|
||||||
final AtomicInteger count = new AtomicInteger(0);
|
final AtomicInteger count = new AtomicInteger(0);
|
||||||
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b));
|
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b), 1024);
|
||||||
ChunkListener<byte[], BytesReference> chunkListener =
|
ChunkListener<byte[], BytesReference> chunkListener =
|
||||||
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
|
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
|
||||||
Chunk<byte[], BytesReference> chunk;
|
Chunk<byte[], BytesReference> chunk;
|
||||||
|
@ -74,7 +74,8 @@ public class PatternInputStreamTest {
|
||||||
"Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8)
|
"Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8)
|
||||||
};
|
};
|
||||||
for (byte[] b : bytes) {
|
for (byte[] b : bytes) {
|
||||||
PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b));
|
PatternInputStream separatorStream =
|
||||||
|
PatternInputStream.lf(new ByteArrayInputStream(b), 8192);
|
||||||
long l = separatorStream.chunks().count();
|
long l = separatorStream.chunks().count();
|
||||||
separatorStream.close();
|
separatorStream.close();
|
||||||
assertEquals(2L, l);
|
assertEquals(2L, l);
|
||||||
|
@ -89,7 +90,8 @@ public class PatternInputStreamTest {
|
||||||
}
|
}
|
||||||
Map<Integer, Integer> map = new LinkedHashMap<>();
|
Map<Integer, Integer> map = new LinkedHashMap<>();
|
||||||
final AtomicInteger count = new AtomicInteger(0);
|
final AtomicInteger count = new AtomicInteger(0);
|
||||||
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()));
|
PatternInputStream separatorStream =
|
||||||
|
PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192);
|
||||||
ChunkListener<byte[], BytesReference> chunkListener =
|
ChunkListener<byte[], BytesReference> chunkListener =
|
||||||
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
|
(chunk) -> map.put(count.incrementAndGet(), chunk.data().length());
|
||||||
Chunk<byte[], BytesReference> chunk;
|
Chunk<byte[], BytesReference> chunk;
|
||||||
|
@ -108,7 +110,8 @@ public class PatternInputStreamTest {
|
||||||
output.write("Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8));
|
output.write("Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8));
|
||||||
}
|
}
|
||||||
final AtomicInteger count = new AtomicInteger(0);
|
final AtomicInteger count = new AtomicInteger(0);
|
||||||
PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()));
|
PatternInputStream separatorStream =
|
||||||
|
PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192);
|
||||||
separatorStream.chunks().forEach(chunk -> {
|
separatorStream.chunks().forEach(chunk -> {
|
||||||
count.incrementAndGet();
|
count.incrementAndGet();
|
||||||
assertEquals(5, chunk.data().length());
|
assertEquals(5, chunk.data().length());
|
||||||
|
|
102
src/test/java/org/xbib/marc/io/UncompressLargeFileTest.java
Normal file
102
src/test/java/org/xbib/marc/io/UncompressLargeFileTest.java
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
package org.xbib.marc.io;
|
||||||
|
|
||||||
|
import org.junit.Ignore;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.marc4j.MarcPermissiveStreamReader;
|
||||||
|
import org.marc4j.MarcReader;
|
||||||
|
import org.marc4j.marc.Record;
|
||||||
|
import org.xbib.marc.Marc;
|
||||||
|
import org.xbib.marc.MarcRecord;
|
||||||
|
import org.xbib.marc.MarcRecordListener;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import java.util.logging.Logger;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
@Ignore
|
||||||
|
public class UncompressLargeFileTest {
|
||||||
|
|
||||||
|
private static final Logger logger = Logger.getLogger(UncompressLargeFileTest.class.getName());
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void uncompress() throws IOException {
|
||||||
|
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
|
||||||
|
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
|
||||||
|
byte[] buffer = new byte[1024 * 1024];
|
||||||
|
int length;
|
||||||
|
while ((length = gzipInputStream.read(buffer)) != -1) {
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
gzipInputStream.close();
|
||||||
|
inputStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void uncompressAndDecodeChunks() throws Exception {
|
||||||
|
logger.log(Level.INFO, "start decoding chunks");
|
||||||
|
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
|
||||||
|
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
|
||||||
|
final AtomicInteger counter = new AtomicInteger(0);
|
||||||
|
BufferedSeparatorInputStream bufferedSeparatorInputStream =
|
||||||
|
new BufferedSeparatorInputStream(gzipInputStream, 1024 * 1024);
|
||||||
|
ChunkListener<byte[], BytesReference> chunkListener = (chunk) -> counter.incrementAndGet();
|
||||||
|
Chunk<byte[], BytesReference> chunk;
|
||||||
|
while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) {
|
||||||
|
chunkListener.chunk(chunk);
|
||||||
|
}
|
||||||
|
gzipInputStream.close();
|
||||||
|
logger.log(Level.INFO, "stop decoding chunks, counter = " + counter.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void uncompressAndDecodeMarcRecords() throws IOException {
|
||||||
|
logger.log(Level.INFO, "start decoding MARC");
|
||||||
|
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
|
||||||
|
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
|
||||||
|
final AtomicInteger counter = new AtomicInteger(0);
|
||||||
|
Marc.builder()
|
||||||
|
.setInputStream(gzipInputStream)
|
||||||
|
.setMarcRecordListener(new MarcRecordListener() {
|
||||||
|
@Override
|
||||||
|
public void beginCollection() {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void record(MarcRecord marcRecord) {
|
||||||
|
counter.incrementAndGet();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void endCollection() {
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.setCharset(StandardCharsets.UTF_8)
|
||||||
|
.build()
|
||||||
|
.writeRecords(1024 * 1024);
|
||||||
|
gzipInputStream.close();
|
||||||
|
inputStream.close();
|
||||||
|
logger.log(Level.INFO, "stop deocding MARC, counter = " + counter.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void uncompressAndDecodeWithMarc4j() throws Exception {
|
||||||
|
logger.log(Level.INFO, "start decoding MARC4J");
|
||||||
|
InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz"));
|
||||||
|
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024);
|
||||||
|
final AtomicInteger counter = new AtomicInteger(0);
|
||||||
|
MarcReader reader = new MarcPermissiveStreamReader(gzipInputStream, true, true);
|
||||||
|
while (reader.hasNext()) {
|
||||||
|
Record record = reader.next();
|
||||||
|
counter.incrementAndGet();
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
gzipInputStream.close();
|
||||||
|
inputStream.close();
|
||||||
|
logger.log(Level.INFO, "stop deocding MARC4J, counter = " + counter.get());
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue