diff --git a/build.gradle b/build.gradle index 1c775a0..3974c71 100644 --- a/build.gradle +++ b/build.gradle @@ -27,6 +27,7 @@ dependencies { testImplementation("org.mockito:mockito-core:${project.property('mockito.version')}") { exclude group: 'org.hamcrest' } + testImplementation "org.marc4j:marc4j:${project.property('marc4j.version')}" } compileJava { diff --git a/gradle.properties b/gradle.properties index 0f2ba58..52ad92a 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,9 +1,9 @@ group = org.xbib name = marc -version = 2.1.0 +version = 2.2.0 # main -xbib-content.version = 2.0.4 +xbib-content.version = 2.0.5 # runtime xbib-bibliographic-character-sets.version = 1.0.0 @@ -15,3 +15,4 @@ xalan.version = 2.7.2 xmlunit-matchers.version = 2.6.3 system-rules.version = 1.19.0 mockito.version = 3.1.0 +marc4j.version = 2.9.1 diff --git a/src/main/java/org/xbib/marc/Marc.java b/src/main/java/org/xbib/marc/Marc.java index 2e1521f..d1d6c59 100644 --- a/src/main/java/org/xbib/marc/Marc.java +++ b/src/main/java/org/xbib/marc/Marc.java @@ -87,6 +87,8 @@ public final class Marc { private static final byte[] CRLF = { '\r', '\n'}; + private static final int DEFAULT_BUFFER_SIZE = 8192; + private final Builder builder; private Marc(Builder builder) { @@ -101,12 +103,17 @@ public final class Marc { return new Builder(); } + public MarcIso2709Reader iso2709XmlReader() { + return iso2709XmlReader(DEFAULT_BUFFER_SIZE); + } + /** * Return an XML reader on a ISO 2709 input stream. + * @param bufferSize buffer size for input stream * @return XML reader */ - public MarcIso2709Reader iso2709XmlReader() { - return new MarcIso2709Reader(builder); + public MarcIso2709Reader iso2709XmlReader(int bufferSize) { + return new MarcIso2709Reader(builder, bufferSize); } /** @@ -137,12 +144,17 @@ public final class Marc { xmlEventReader.close(); } + public BufferedSeparatorInputStream iso2709Stream() { + return iso2709Stream(DEFAULT_BUFFER_SIZE); + } + /** * Return ISO 2709 stream. + * @param bufferSize buffer size * @return ISO 2709 stream */ - public BufferedSeparatorInputStream iso2709Stream() { - return builder.iso2709Stream(); + public BufferedSeparatorInputStream iso2709Stream(int bufferSize) { + return builder.iso2709Stream(bufferSize); } /** @@ -224,7 +236,8 @@ public final class Marc { * @throws IOException if parsing fails */ public Document document() throws IOException { - return new Sax2Dom(iso2709XmlReader(), new InputSource(builder.getInputStream())).document(); + return new Sax2Dom(iso2709XmlReader(DEFAULT_BUFFER_SIZE), + new InputSource(builder.getInputStream())).document(); } /** @@ -260,16 +273,25 @@ public final class Marc { } } + public void writeCollection() throws IOException { + writeCollection(DEFAULT_BUFFER_SIZE); + } + /** * Write MARC bibliographic data from seperator stream chunk by chunk to a MARC collection. + * @param bufferSize buffer size for the separator input stream * @throws IOException if writing fails */ - public void writeCollection() throws IOException { - wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream())); + public void writeCollection(int bufferSize) throws IOException { + wrapIntoCollection(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize)); } public void writeCollection(String type) throws IOException { - wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream())); + writeCollection(type, DEFAULT_BUFFER_SIZE); + } + + public void writeCollection(String type, int bufferSize) throws IOException { + wrapIntoCollection(type, new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize)); } public int wrapIntoCollection(ChunkStream stream) throws IOException { @@ -331,21 +353,31 @@ public final class Marc { return count; } + public void writeRecordCollection() throws IOException { + writeRecordCollection(DEFAULT_BUFFER_SIZE); + } + /** * Write MARC bibliographic events from a separator strem, record by record, wrapped into a * pair of {@code collection} elements. + * @param bufferSize buffer size * @throws IOException if writing fails */ - public void writeRecordCollection() throws IOException { - wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), true); + public void writeRecordCollection(int bufferSize) throws IOException { + wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), true); + } + + public void writeRecords() throws IOException { + writeRecords(DEFAULT_BUFFER_SIZE); } /** * Write MARC bibliographic events from a separator strem, record by record. + * @param bufferSize buffer size for separator input stream * @throws IOException if writing fails */ - public void writeRecords() throws IOException { - wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream()), false); + public void writeRecords(int bufferSize) throws IOException { + wrapRecords(new BufferedSeparatorInputStream(builder.getInputStream(), bufferSize), false); } /** @@ -551,8 +583,11 @@ public final class Marc { */ public static class MarcIso2709Reader extends MarcXmlReader { - private MarcIso2709Reader(Builder builder) { + private final int bufferSize; + + private MarcIso2709Reader(Builder builder, int bufferSize) { super(builder); + this.bufferSize = bufferSize; } @Override @@ -560,7 +595,8 @@ public final class Marc { if (input.getByteStream() == null) { throw new IllegalArgumentException("no input stream found"); } - try (BufferedSeparatorInputStream stream = new BufferedSeparatorInputStream(input.getByteStream())) { + try (BufferedSeparatorInputStream stream = + new BufferedSeparatorInputStream(input.getByteStream(), bufferSize)) { MarcGenerator marcGenerator = builder.createGenerator(); Chunk chunk; while ((chunk = stream.readChunk()) != null) { @@ -895,10 +931,11 @@ public final class Marc { /** * Create an ISO 2709 stream. + * @param bufferSize buffer size * @return ISO 2709 stream */ - public BufferedSeparatorInputStream iso2709Stream() { - return new BufferedSeparatorInputStream(inputStream); + public BufferedSeparatorInputStream iso2709Stream(int bufferSize) { + return new BufferedSeparatorInputStream(inputStream, bufferSize); } /** @@ -1088,7 +1125,7 @@ public final class Marc { */ public Iterator recordIterator() { if (stream == null) { - this.stream = new BufferedSeparatorInputStream(inputStream); + this.stream = new BufferedSeparatorInputStream(inputStream, DEFAULT_BUFFER_SIZE); } if (marcGenerator == null) { this.marcGenerator = createGenerator(); diff --git a/src/main/java/org/xbib/marc/dialects/aleph/AlephSequentialInputStream.java b/src/main/java/org/xbib/marc/dialects/aleph/AlephSequentialInputStream.java index ccd6501..2d634f8 100644 --- a/src/main/java/org/xbib/marc/dialects/aleph/AlephSequentialInputStream.java +++ b/src/main/java/org/xbib/marc/dialects/aleph/AlephSequentialInputStream.java @@ -43,12 +43,24 @@ public class AlephSequentialInputStream extends PatternInputStream { private String alephSysNumber; - public AlephSequentialInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { - super(in, pattern); + public AlephSequentialInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator) { + this(in, pattern, marcGenerator, 8192); + } + + public AlephSequentialInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator, + int bufferSize) { + super(in, pattern, bufferSize); this.marcGenerator = marcGenerator; this.bytesStreamOutput = new BytesStreamOutput(); // this format might come without a record label, create a default one - this.label = RecordLabel.builder().setIndicatorLength(2).setSubfieldIdentifierLength(1).build(); + this.label = RecordLabel.builder() + .setIndicatorLength(2) + .setSubfieldIdentifierLength(1) + .build(); } @Override diff --git a/src/main/java/org/xbib/marc/dialects/bibliomondo/BiblioMondoInputStream.java b/src/main/java/org/xbib/marc/dialects/bibliomondo/BiblioMondoInputStream.java index 8301530..eba13ba 100644 --- a/src/main/java/org/xbib/marc/dialects/bibliomondo/BiblioMondoInputStream.java +++ b/src/main/java/org/xbib/marc/dialects/bibliomondo/BiblioMondoInputStream.java @@ -53,8 +53,17 @@ public class BiblioMondoInputStream extends PatternInputStream { private final BytesStreamOutput bytesStreamOutput; - public BiblioMondoInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { - super(in, pattern); + public BiblioMondoInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator) { + this(in, pattern, marcGenerator, 8192); + } + + public BiblioMondoInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator, + int bufferSize) { + super(in, pattern, bufferSize); this.marcGenerator = marcGenerator; this.bytesStreamOutput = new BytesStreamOutput(); } diff --git a/src/main/java/org/xbib/marc/dialects/mab/diskette/MabDisketteInputStream.java b/src/main/java/org/xbib/marc/dialects/mab/diskette/MabDisketteInputStream.java index b3749d1..54150ed 100644 --- a/src/main/java/org/xbib/marc/dialects/mab/diskette/MabDisketteInputStream.java +++ b/src/main/java/org/xbib/marc/dialects/mab/diskette/MabDisketteInputStream.java @@ -50,8 +50,19 @@ public class MabDisketteInputStream extends PatternInputStream { this(in, pattern, '\u0000', marcGenerator); } - public MabDisketteInputStream(InputStream in, byte[] pattern, char subfieldDelimiter, MarcGenerator marcGenerator) { - super(in, pattern); + public MabDisketteInputStream(InputStream in, + byte[] pattern, + char subfieldDelimiter, + MarcGenerator marcGenerator) { + this(in, pattern, subfieldDelimiter, marcGenerator, 8192); + } + + public MabDisketteInputStream(InputStream in, + byte[] pattern, + char subfieldDelimiter, + MarcGenerator marcGenerator, + int bufferSize) { + super(in, pattern, bufferSize); this.marcGenerator = marcGenerator; this.subfieldDelimiter = subfieldDelimiter; this.bytesStreamOutput = new BytesStreamOutput(); diff --git a/src/main/java/org/xbib/marc/dialects/pica/PicaInputStream.java b/src/main/java/org/xbib/marc/dialects/pica/PicaInputStream.java index 024b18a..93fcc38 100644 --- a/src/main/java/org/xbib/marc/dialects/pica/PicaInputStream.java +++ b/src/main/java/org/xbib/marc/dialects/pica/PicaInputStream.java @@ -50,8 +50,17 @@ public class PicaInputStream extends PatternInputStream { private final BytesStreamOutput bytesStreamOutput; - public PicaInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { - super(in, pattern); + public PicaInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator) { + this(in, pattern, marcGenerator, 8192); + } + + public PicaInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator, + int bufferSize) { + super(in, pattern, bufferSize); this.marcGenerator = marcGenerator; this.bytesStreamOutput = new BytesStreamOutput(); } diff --git a/src/main/java/org/xbib/marc/dialects/pica/PicaPlainInputStream.java b/src/main/java/org/xbib/marc/dialects/pica/PicaPlainInputStream.java index 1157f67..e5f620d 100644 --- a/src/main/java/org/xbib/marc/dialects/pica/PicaPlainInputStream.java +++ b/src/main/java/org/xbib/marc/dialects/pica/PicaPlainInputStream.java @@ -52,8 +52,17 @@ public class PicaPlainInputStream extends PatternInputStream { private boolean started; - public PicaPlainInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { - super(in, pattern); + public PicaPlainInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator) { + this(in, pattern, marcGenerator, 8192); + } + + public PicaPlainInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator, + int bufferSize) { + super(in, pattern, bufferSize); this.marcGenerator = marcGenerator; this.bytesStreamOutput = new BytesStreamOutput(); this.started = true; diff --git a/src/main/java/org/xbib/marc/dialects/sisis/SisisInputStream.java b/src/main/java/org/xbib/marc/dialects/sisis/SisisInputStream.java index 29d1b24..0ed36df 100644 --- a/src/main/java/org/xbib/marc/dialects/sisis/SisisInputStream.java +++ b/src/main/java/org/xbib/marc/dialects/sisis/SisisInputStream.java @@ -72,14 +72,24 @@ public class SisisInputStream extends PatternInputStream { private boolean labelEmitted; + public SisisInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator) { + this(in, pattern, marcGenerator, 8192); + } + /** * Create a SISIS input stream. * @param in the underlying input stream * @param pattern the pattern for the separator * @param marcGenerator a MARC generator + * @param bufferSize buffer size */ - public SisisInputStream(InputStream in, byte[] pattern, MarcGenerator marcGenerator) { - super(in, pattern); + public SisisInputStream(InputStream in, + byte[] pattern, + MarcGenerator marcGenerator, + int bufferSize) { + super(in, pattern, bufferSize); this.marcGenerator = marcGenerator; this.bytesStreamOutput = new BytesStreamOutput(); // this format comes without a record label, create a default one diff --git a/src/main/java/org/xbib/marc/io/BaseChunkStream.java b/src/main/java/org/xbib/marc/io/BaseChunkStream.java index 3174ea9..59818b8 100644 --- a/src/main/java/org/xbib/marc/io/BaseChunkStream.java +++ b/src/main/java/org/xbib/marc/io/BaseChunkStream.java @@ -45,14 +45,6 @@ abstract class BaseChunkStream extends BufferedInputStream implements ChunkStrea int buffersize; - /** - * Create a base chunk stream. - * @param in the underlying input stream - */ - BaseChunkStream(InputStream in) { - this(in, DEFAULT_BUFFER_SIZE); - } - /** * Create a base chunk stream. * @param in the underlying input stream diff --git a/src/main/java/org/xbib/marc/io/BufferedSeparatorInputStream.java b/src/main/java/org/xbib/marc/io/BufferedSeparatorInputStream.java index 65bcfdf..5a01d94 100644 --- a/src/main/java/org/xbib/marc/io/BufferedSeparatorInputStream.java +++ b/src/main/java/org/xbib/marc/io/BufferedSeparatorInputStream.java @@ -70,9 +70,10 @@ public class BufferedSeparatorInputStream extends BaseChunkStream { /** * Create a buffered information separator stream. * @param in the underlying input stream + * @param bufferSize the buffer size */ - public BufferedSeparatorInputStream(InputStream in) { - super(in); + public BufferedSeparatorInputStream(InputStream in, int bufferSize) { + super(in, bufferSize); } @Override diff --git a/src/main/java/org/xbib/marc/io/PatternInputStream.java b/src/main/java/org/xbib/marc/io/PatternInputStream.java index 6e9216f..0a21c7c 100644 --- a/src/main/java/org/xbib/marc/io/PatternInputStream.java +++ b/src/main/java/org/xbib/marc/io/PatternInputStream.java @@ -39,9 +39,10 @@ public class PatternInputStream extends BaseChunkStream { * Create a pattern delimited input stream. * @param in the underlying input stream * @param pattern the pattern + * @param bufferSize buffer size */ - public PatternInputStream(InputStream in, byte[] pattern) { - super(in); + public PatternInputStream(InputStream in, byte[] pattern, int bufferSize) { + super(in, bufferSize); requireNonNull(pattern); this.pattern = pattern.clone(); } @@ -49,19 +50,21 @@ public class PatternInputStream extends BaseChunkStream { /** * Convenience method to cerate a line-feed pattern separated input stream. * @param in the input stream to wrap + * @param bufferSize buffer size * @return the pattern input stream */ - public static PatternInputStream lf(InputStream in) { - return new PatternInputStream(in, LF); + public static PatternInputStream lf(InputStream in, int bufferSize) { + return new PatternInputStream(in, LF, bufferSize); } /** * Convenience method to cerate a carriage-return/line-feed pattern separated input stream. * @param in the input stream to wrap + * @param bufferSize buffer size * @return the pattern input stream */ - public static PatternInputStream crlf(InputStream in) { - return new PatternInputStream(in, CRLF); + public static PatternInputStream crlf(InputStream in, int bufferSize) { + return new PatternInputStream(in, CRLF, bufferSize); } /** diff --git a/src/main/java/org/xbib/marc/tools/MarcTool.java b/src/main/java/org/xbib/marc/tools/MarcTool.java index 1c79e41..a17f821 100644 --- a/src/main/java/org/xbib/marc/tools/MarcTool.java +++ b/src/main/java/org/xbib/marc/tools/MarcTool.java @@ -19,6 +19,7 @@ package org.xbib.marc.tools; import org.xbib.marc.Marc; import org.xbib.marc.xml.MarcXchangeWriter; +import java.io.BufferedInputStream; import java.io.InputStream; import java.net.URL; import java.nio.charset.Charset; @@ -101,9 +102,10 @@ public class MarcTool { } if ("marc2xml".equals(mode)) { try (InputStream in = Files.newInputStream(Paths.get(input)); + BufferedInputStream bufferedInputStream = new BufferedInputStream(in, 65536); MarcXchangeWriter writer = new MarcXchangeWriter(Files.newBufferedWriter(Paths.get(output)), true)) { Marc.Builder builder = Marc.builder() - .setInputStream(in) + .setInputStream(bufferedInputStream) .setCharset(Charset.forName(charset)) .setMarcListener(writer); if (schema != null && stylesheet != null && result != null) { @@ -111,7 +113,7 @@ public class MarcTool { builder.setSchema(schema).build().transform(new URL(stylesheet), new StreamResult(Files.newBufferedWriter(Paths.get(result)))); } else { - builder.build().writeCollection(); + builder.build().writeCollection(65536); } } catch (Exception e) { logger.log(Level.SEVERE, e.getMessage(), e); diff --git a/src/test/java/org/xbib/marc/dialects/mab/HBZTest.java b/src/test/java/org/xbib/marc/dialects/mab/HBZTest.java index eadbdf0..16b7fae 100644 --- a/src/test/java/org/xbib/marc/dialects/mab/HBZTest.java +++ b/src/test/java/org/xbib/marc/dialects/mab/HBZTest.java @@ -9,6 +9,7 @@ import org.xbib.marc.MarcRecord; import java.io.InputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.concurrent.atomic.AtomicInteger; /** @@ -26,7 +27,7 @@ public class HBZTest { try (InputStream in = getClass().getResource(file).openStream()) { Marc marc = Marc.builder() .setInputStream(in) - .setCharset(Charset.forName("UTF-8")) + .setCharset(StandardCharsets.UTF_8) .build(); marc.iso2709Stream().chunks().forEach(chunk -> { count.incrementAndGet(); diff --git a/src/test/java/org/xbib/marc/dialects/mab/MabTest.java b/src/test/java/org/xbib/marc/dialects/mab/MabTest.java index 9fabc07..ba94c3d 100644 --- a/src/test/java/org/xbib/marc/dialects/mab/MabTest.java +++ b/src/test/java/org/xbib/marc/dialects/mab/MabTest.java @@ -323,7 +323,7 @@ public class MabTest { assertThat(file, CompareMatcher.isIdenticalTo(getClass().getResource(s + "-mapped.xml").openStream())); } - private class LOWTransformer implements MarcValueTransformer { + private static class LOWTransformer implements MarcValueTransformer { @Override public String transform(String value) { diff --git a/src/test/java/org/xbib/marc/dialects/mab/OBVSGTest.java b/src/test/java/org/xbib/marc/dialects/mab/OBVSGTest.java index 487025e..4fe0d57 100644 --- a/src/test/java/org/xbib/marc/dialects/mab/OBVSGTest.java +++ b/src/test/java/org/xbib/marc/dialects/mab/OBVSGTest.java @@ -10,6 +10,7 @@ import org.xbib.marc.label.RecordLabel; import java.io.InputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.concurrent.atomic.AtomicInteger; /** @@ -29,7 +30,7 @@ public class OBVSGTest { try (InputStream in = getClass().getResource(file).openStream()) { Marc marc = Marc.builder() .setInputStream(in) - .setCharset(Charset.forName("UTF-8")) + .setCharset(StandardCharsets.UTF_8) .build(); marc.iso2709Stream().chunks().forEach(chunk -> { count.incrementAndGet(); diff --git a/src/test/java/org/xbib/marc/io/BufferedSeparatorInputStreamTest.java b/src/test/java/org/xbib/marc/io/BufferedSeparatorInputStreamTest.java index d627774..6540240 100644 --- a/src/test/java/org/xbib/marc/io/BufferedSeparatorInputStreamTest.java +++ b/src/test/java/org/xbib/marc/io/BufferedSeparatorInputStreamTest.java @@ -83,7 +83,8 @@ public class BufferedSeparatorInputStreamTest { }; String s = "sequential.groupstream"; InputStream in = getClass().getResource(s).openStream(); - BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in); + BufferedSeparatorInputStream bufferedSeparatorInputStream = + new BufferedSeparatorInputStream(in, 8192); Chunk chunk; while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) { listener.chunk(chunk); @@ -101,7 +102,8 @@ public class BufferedSeparatorInputStreamTest { String s = "sequential.groupstream"; InputStream in = getClass().getResource(s).openStream(); final AtomicInteger count = new AtomicInteger(0); - BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in); + BufferedSeparatorInputStream bufferedSeparatorInputStream = + new BufferedSeparatorInputStream(in, 8192); ChunkListener chunkListener = (chunk) -> count.incrementAndGet(); Chunk chunk; while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) { @@ -117,7 +119,8 @@ public class BufferedSeparatorInputStreamTest { Map map2 = new LinkedHashMap<>(); InputStream in2 = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream(); final AtomicInteger count2 = new AtomicInteger(0); - BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in2); + BufferedSeparatorInputStream bufferedSeparatorInputStream = + new BufferedSeparatorInputStream(in2, 8192); ChunkListener chunkListener2 = (chunk2) -> map2.put(count2.incrementAndGet(), chunk2.data().length()); Chunk chunk2; @@ -151,7 +154,8 @@ public class BufferedSeparatorInputStreamTest { public void testChunkCount() throws Exception { String s = "periouni.mrc"; InputStream in = getClass().getResource("/org/xbib/marc/dialects/unimarc/" + s).openStream(); - BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in); + BufferedSeparatorInputStream bufferedSeparatorInputStream = + new BufferedSeparatorInputStream(in, 8192); long l = bufferedSeparatorInputStream.chunks().count(); assertEquals(192247, l); } @@ -162,7 +166,8 @@ public class BufferedSeparatorInputStreamTest { Map map = new LinkedHashMap<>(); InputStream in = getClass().getResource("/org/xbib/marc/" + s).openStream(); final AtomicInteger count = new AtomicInteger(0); - BufferedSeparatorInputStream bufferedSeparatorInputStream = new BufferedSeparatorInputStream(in); + BufferedSeparatorInputStream bufferedSeparatorInputStream = + new BufferedSeparatorInputStream(in, 8192); ChunkListener chunkListener = (chunk) -> map.put(count.incrementAndGet(), chunk.data().length()); Chunk chunk; @@ -174,5 +179,4 @@ public class BufferedSeparatorInputStreamTest { + "31=2, 32=9, 33=9, 34=2, 35=6, 36=9, 37=0}", map.toString()); in.close(); } - } diff --git a/src/test/java/org/xbib/marc/io/PatternInputStreamTest.java b/src/test/java/org/xbib/marc/io/PatternInputStreamTest.java index 234555c..84f002d 100644 --- a/src/test/java/org/xbib/marc/io/PatternInputStreamTest.java +++ b/src/test/java/org/xbib/marc/io/PatternInputStreamTest.java @@ -38,7 +38,7 @@ public class PatternInputStreamTest { byte[] b = "Hello\nWorld".getBytes(StandardCharsets.UTF_8); Map map = new LinkedHashMap<>(); final AtomicInteger count = new AtomicInteger(0); - PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b)); + PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b), 1024); ChunkListener chunkListener = (chunk) -> map.put(count.incrementAndGet(), chunk.data().length()); Chunk chunk; @@ -55,7 +55,7 @@ public class PatternInputStreamTest { byte[] b = "Hello\r\nWorld".getBytes(StandardCharsets.UTF_8); Map map = new LinkedHashMap<>(); final AtomicInteger count = new AtomicInteger(0); - PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b)); + PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(b), 1024); ChunkListener chunkListener = (chunk) -> map.put(count.incrementAndGet(), chunk.data().length()); Chunk chunk; @@ -74,7 +74,8 @@ public class PatternInputStreamTest { "Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8) }; for (byte[] b : bytes) { - PatternInputStream separatorStream = PatternInputStream.lf(new ByteArrayInputStream(b)); + PatternInputStream separatorStream = + PatternInputStream.lf(new ByteArrayInputStream(b), 8192); long l = separatorStream.chunks().count(); separatorStream.close(); assertEquals(2L, l); @@ -89,7 +90,8 @@ public class PatternInputStreamTest { } Map map = new LinkedHashMap<>(); final AtomicInteger count = new AtomicInteger(0); - PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes())); + PatternInputStream separatorStream = + PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192); ChunkListener chunkListener = (chunk) -> map.put(count.incrementAndGet(), chunk.data().length()); Chunk chunk; @@ -108,7 +110,8 @@ public class PatternInputStreamTest { output.write("Hello\r\nWorld\r\n".getBytes(StandardCharsets.UTF_8)); } final AtomicInteger count = new AtomicInteger(0); - PatternInputStream separatorStream = PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes())); + PatternInputStream separatorStream = + PatternInputStream.crlf(new ByteArrayInputStream(output.bytes().toBytes()), 8192); separatorStream.chunks().forEach(chunk -> { count.incrementAndGet(); assertEquals(5, chunk.data().length()); diff --git a/src/test/java/org/xbib/marc/io/UncompressLargeFileTest.java b/src/test/java/org/xbib/marc/io/UncompressLargeFileTest.java new file mode 100644 index 0000000..42ef8d8 --- /dev/null +++ b/src/test/java/org/xbib/marc/io/UncompressLargeFileTest.java @@ -0,0 +1,102 @@ +package org.xbib.marc.io; + +import org.junit.Ignore; +import org.junit.Test; +import org.marc4j.MarcPermissiveStreamReader; +import org.marc4j.MarcReader; +import org.marc4j.marc.Record; +import org.xbib.marc.Marc; +import org.xbib.marc.MarcRecord; +import org.xbib.marc.MarcRecordListener; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.zip.GZIPInputStream; + +@Ignore +public class UncompressLargeFileTest { + + private static final Logger logger = Logger.getLogger(UncompressLargeFileTest.class.getName()); + + @Test + public void uncompress() throws IOException { + InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz")); + GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024); + byte[] buffer = new byte[1024 * 1024]; + int length; + while ((length = gzipInputStream.read(buffer)) != -1) { + // do nothing + } + gzipInputStream.close(); + inputStream.close(); + } + + @Test + public void uncompressAndDecodeChunks() throws Exception { + logger.log(Level.INFO, "start decoding chunks"); + InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz")); + GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024); + final AtomicInteger counter = new AtomicInteger(0); + BufferedSeparatorInputStream bufferedSeparatorInputStream = + new BufferedSeparatorInputStream(gzipInputStream, 1024 * 1024); + ChunkListener chunkListener = (chunk) -> counter.incrementAndGet(); + Chunk chunk; + while ((chunk = bufferedSeparatorInputStream.readChunk()) != null) { + chunkListener.chunk(chunk); + } + gzipInputStream.close(); + logger.log(Level.INFO, "stop decoding chunks, counter = " + counter.get()); + } + + @Test + public void uncompressAndDecodeMarcRecords() throws IOException { + logger.log(Level.INFO, "start decoding MARC"); + InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz")); + GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024); + final AtomicInteger counter = new AtomicInteger(0); + Marc.builder() + .setInputStream(gzipInputStream) + .setMarcRecordListener(new MarcRecordListener() { + @Override + public void beginCollection() { + } + + @Override + public void record(MarcRecord marcRecord) { + counter.incrementAndGet(); + } + + @Override + public void endCollection() { + } + }) + .setCharset(StandardCharsets.UTF_8) + .build() + .writeRecords(1024 * 1024); + gzipInputStream.close(); + inputStream.close(); + logger.log(Level.INFO, "stop deocding MARC, counter = " + counter.get()); + } + + @Test + public void uncompressAndDecodeWithMarc4j() throws Exception { + logger.log(Level.INFO, "start decoding MARC4J"); + InputStream inputStream = Files.newInputStream(Paths.get("/data/zdb/baseline/zdb_dnbmarc_20200309.mrc.gz")); + GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream, 1024 * 1024); + final AtomicInteger counter = new AtomicInteger(0); + MarcReader reader = new MarcPermissiveStreamReader(gzipInputStream, true, true); + while (reader.hasNext()) { + Record record = reader.next(); + counter.incrementAndGet(); + // do nothing + } + gzipInputStream.close(); + inputStream.close(); + logger.log(Level.INFO, "stop deocding MARC4J, counter = " + counter.get()); + } +}