add iterable XML MARCRecord

This commit is contained in:
Jörg Prante 2023-01-23 14:26:47 +01:00
parent 77bd0ac593
commit b5d4913acf
5 changed files with 187 additions and 69 deletions

View file

@ -1,5 +1,5 @@
group = org.xbib group = org.xbib
name = marc name = marc
version = 2.9.14 version = 2.9.15
org.gradle.warning.mode = ALL org.gradle.warning.mode = ALL

View file

@ -125,12 +125,12 @@ public final class Marc {
} }
/** /**
* Run XML stream parser over an XML input stream, with an XML event consumer. * Run XML stream parser over an XML input stream with an XML event consumer.
* @param xmlInputFactory the XML input factory * @param xmlInputFactory the XML input factory
* @param consumer the XML event consumer * @param consumer the XML event consumer
* @throws XMLStreamException if parsing fails * @throws XMLStreamException if parsing fails
*/ */
public void parseEvents(XMLInputFactory xmlInputFactory, MarcXchangeEventConsumer consumer) throws XMLStreamException { public void parse(XMLInputFactory xmlInputFactory, MarcXchangeEventConsumer consumer) throws XMLStreamException {
Objects.requireNonNull(consumer); Objects.requireNonNull(consumer);
if (builder.getMarcListeners() != null) { if (builder.getMarcListeners() != null) {
for (Map.Entry<String, MarcListener> entry : builder.getMarcListeners().entrySet()) { for (Map.Entry<String, MarcListener> entry : builder.getMarcListeners().entrySet()) {
@ -144,6 +144,19 @@ public final class Marc {
xmlEventReader.close(); xmlEventReader.close();
} }
/**
 * Feed XML events from the given reader into the consumer until the consumer
 * signals that the end of a record was reached or the reader has no more events.
 * Before reading, all MARC listeners configured on the builder are registered
 * on the consumer. The end-of-record flag of the consumer is cleared before returning.
 * @param xmlEventReader the XML event reader to pull events from
 * @param consumer the XML event consumer
 * @throws XMLStreamException if reading or consuming an XML event fails
 */
public void parseNextRecord(XMLEventReader xmlEventReader, MarcXchangeEventConsumer consumer) throws XMLStreamException {
    Objects.requireNonNull(consumer);
    Map<String, MarcListener> configuredListeners = builder.getMarcListeners();
    if (configuredListeners != null) {
        configuredListeners.forEach((type, listener) -> consumer.setMarcListener(type, listener));
    }
    while (true) {
        // stop when the reader is drained or a complete record has been consumed
        if (!xmlEventReader.hasNext() || consumer.isEndRecordReached()) {
            break;
        }
        consumer.add(xmlEventReader.nextEvent());
    }
    // clear the flag so the following call can detect the next record end
    consumer.resetEndRecordReached();
}
public BufferedSeparatorInputStream iso2709Stream() { public BufferedSeparatorInputStream iso2709Stream() {
return iso2709Stream(DEFAULT_BUFFER_SIZE); return iso2709Stream(DEFAULT_BUFFER_SIZE);
} }
@ -575,10 +588,6 @@ public final class Marc {
builder.getMarcListener().endCollection(); builder.getMarcListener().endCollection();
} }
} }
/**
 * Placeholder without implementation: the method body is empty, so calling it does nothing.
 * @throws IOException declared in the signature, but never thrown by the empty body
 */
public void parseRecords() throws IOException {
}
} }
/** /**
@ -1221,6 +1230,75 @@ public final class Marc {
return this; return this;
} }
/**
 * Create an iterator over the MARC records of the XML input,
 * using a newly created {@link MarcXchangeEventConsumer}.
 * @return iterator over MARC records
 */
public Iterator<MarcRecord> xmlRecordIterator() {
    MarcXchangeEventConsumer defaultConsumer = new MarcXchangeEventConsumer();
    return xmlRecordIterator(defaultConsumer);
}
/**
 * Create an iterator over the MARC records of the XML input, pulling XML events
 * through the given consumer one record at a time.
 * A {@link MarcRecordAdapter} is registered on the consumer as MARC listener; it hands
 * each completed record to this builder, from where the iterator picks it up.
 * The returned iterator follows the {@link Iterator} contract: {@code hasNext()} may be
 * called repeatedly without skipping records, and {@code next()} advances by itself.
 * @param consumer the XML event consumer that receives the XML events
 * @return iterator over MARC records
 * @throws IllegalStateException if the XML event reader can not be created, or,
 * while iterating, if XML parsing fails
 */
public Iterator<MarcRecord> xmlRecordIterator(MarcXchangeEventConsumer consumer) {
    Objects.requireNonNull(consumer);
    XMLEventReader xmlEventReader;
    try {
        // NOTE(review): the factory is used with default settings; if the input can be
        // untrusted, consider disabling DTD / external entity support here.
        xmlEventReader = XMLInputFactory.newFactory().createXMLEventReader(inputStream);
    } catch (XMLStreamException e) {
        throw new IllegalStateException(e);
    }
    final MarcRecordAdapter marcRecordAdapter = new MarcRecordAdapter(new MarcRecordListener() {
        @Override
        public void beginCollection() {
        }

        @Override
        public void record(MarcRecord record) {
            marcRecord = record;
        }

        @Override
        public void endCollection() {
        }
    }, Comparator.naturalOrder());
    consumer.setMarcListener(marcRecordAdapter);
    return new Iterator<>() {

        // record parsed ahead by hasNext() and not yet handed out by next()
        private MarcRecord lookahead;

        @Override
        public boolean hasNext() {
            if (lookahead != null) {
                // a record is already waiting, do not parse (and lose) another one
                return true;
            }
            try {
                // forget the previously delivered record before parsing ahead
                record(null);
                while (xmlEventReader.hasNext() && !consumer.isEndRecordReached()) {
                    consumer.add(xmlEventReader.nextEvent());
                }
                consumer.resetEndRecordReached();
                lookahead = getMarcRecord();
            } catch (XMLStreamException e) {
                throw new IllegalStateException(e);
            }
            return lookahead != null;
        }

        @Override
        public MarcRecord next() {
            // hasNext() parses ahead if required, so next() works without a prior hasNext() call
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            MarcRecord record = lookahead;
            lookahead = null;
            return record;
        }
    };
}
/**
 * Expose the XML record iterator as an {@link Iterable}, so that MARC records
 * can be consumed in an enhanced {@code for} statement.
 * Every call of {@code iterator()} on the result invokes {@link #xmlRecordIterator()}.
 * @return an iterable over the MARC records of the XML input
 */
public Iterable<MarcRecord> xmlIterable() {
    return () -> xmlRecordIterator();
}
/**
 * Create a stream of the MARC records of the XML input, backed by {@link #xmlIterable()}.
 * @return a sequential (non-parallel) stream of MARC records
 */
public Stream<MarcRecord> xmlRecordStream() {
    Iterable<MarcRecord> records = xmlIterable();
    return StreamSupport.stream(records.spliterator(), false);
}
private MarcRecord getMarcRecord() { private MarcRecord getMarcRecord() {
return marcRecord; return marcRecord;
} }

View file

@ -18,6 +18,7 @@ package org.xbib.marc;
import org.xbib.marc.label.RecordLabel; import org.xbib.marc.label.RecordLabel;
import java.util.Comparator; import java.util.Comparator;
import java.util.Objects;
/** /**
* The Marc record adapter collects Marc field events, collects them in a Marc builder, * The Marc record adapter collects Marc field events, collects them in a Marc builder,
@ -33,6 +34,7 @@ public class MarcRecordAdapter implements MarcListener {
private final Comparator<String> comparator; private final Comparator<String> comparator;
public MarcRecordAdapter(MarcRecordListener marcRecordListener, Comparator<String> comparator) { public MarcRecordAdapter(MarcRecordListener marcRecordListener, Comparator<String> comparator) {
Objects.requireNonNull(marcRecordListener);
this.marcRecordListener = marcRecordListener; this.marcRecordListener = marcRecordListener;
this.builder = Marc.builder(); this.builder = Marc.builder();
this.comparator = comparator; this.comparator = comparator;

View file

@ -45,13 +45,13 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo
private final Deque<MarcField.Builder> stack; private final Deque<MarcField.Builder> stack;
private final Map<String, MarcListener> listeners; private final Map<String, MarcListener> marcListeners;
private final StringBuilder content;
private MarcValueTransformers marcValueTransformers; private MarcValueTransformers marcValueTransformers;
private MarcListener listener; private MarcListener marcListener;
private final StringBuilder content;
private String format; private String format;
@ -59,23 +59,26 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo
private final Set<String> validNamespaces; private final Set<String> validNamespaces;
private boolean endRecordReached;
public MarcXchangeEventConsumer() { public MarcXchangeEventConsumer() {
this.stack = new LinkedList<>(); this.stack = new LinkedList<>();
this.listeners = new HashMap<>(); this.marcListeners = new HashMap<>();
this.content = new StringBuilder(); this.content = new StringBuilder();
this.format = MARC21_FORMAT; this.format = MARC21_FORMAT;
this.type = BIBLIOGRAPHIC_TYPE; this.type = BIBLIOGRAPHIC_TYPE;
this.validNamespaces = new HashSet<>(); this.validNamespaces = new HashSet<>();
this.validNamespaces.addAll(Set.of(MARCXCHANGE_V1_NS_URI, MARCXCHANGE_V2_NS_URI, MARC21_SCHEMA_URI)); this.validNamespaces.addAll(Set.of(MARCXCHANGE_V1_NS_URI, MARCXCHANGE_V2_NS_URI, MARC21_SCHEMA_URI));
this.endRecordReached = false;
} }
public MarcXchangeEventConsumer setMarcListener(String type, MarcListener listener) { public MarcXchangeEventConsumer setMarcListener(String type, MarcListener listener) {
this.listeners.put(type, listener); this.marcListeners.put(type, listener);
return this; return this;
} }
public MarcXchangeEventConsumer setMarcListener(MarcListener listener) { public MarcXchangeEventConsumer setMarcListener(MarcListener listener) {
this.listeners.put(BIBLIOGRAPHIC_TYPE, listener); this.marcListeners.put(BIBLIOGRAPHIC_TYPE, listener);
return this; return this;
} }
@ -91,37 +94,38 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo
@Override @Override
public void beginCollection() { public void beginCollection() {
if (listener != null) { if (marcListener != null) {
listener.beginCollection(); marcListener.beginCollection();
} }
} }
@Override @Override
public void endCollection() { public void endCollection() {
if (listener != null) { if (marcListener != null) {
listener.endCollection(); marcListener.endCollection();
} }
} }
@Override @Override
public void beginRecord(String format, String type) { public void beginRecord(String format, String type) {
this.listener = listeners.get(type != null ? type : BIBLIOGRAPHIC_TYPE); this.marcListener = marcListeners.get(type != null ? type : BIBLIOGRAPHIC_TYPE);
if (listener != null) { if (marcListener != null) {
listener.beginRecord(format, type); marcListener.beginRecord(format, type);
} }
} }
@Override @Override
public void endRecord() { public void endRecord() {
if (listener != null) { if (marcListener != null) {
listener.endRecord(); marcListener.endRecord();
} }
this.endRecordReached = true;
} }
@Override @Override
public void leader(RecordLabel label) { public void leader(RecordLabel label) {
if (listener != null) { if (marcListener != null) {
listener.leader(label); marcListener.leader(label);
} }
} }
@ -131,8 +135,8 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo
if (marcValueTransformers != null) { if (marcValueTransformers != null) {
field = marcValueTransformers.transformValue(field); field = marcValueTransformers.transformValue(field);
} }
if (listener != null) { if (marcListener != null) {
listener.field(field); marcListener.field(field);
} }
} }
@ -141,7 +145,7 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo
if (event.isStartElement()) { if (event.isStartElement()) {
StartElement element = (StartElement) event; StartElement element = (StartElement) event;
String uri = element.getName().getNamespaceURI(); String uri = element.getName().getNamespaceURI();
if (!isNamespace(uri)) { if (!validNamespaces.contains(uri)) {
return; return;
} }
String localName = element.getName().getLocalPart(); String localName = element.getName().getLocalPart();
@ -191,69 +195,57 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo
} }
content.setLength(0); content.setLength(0);
switch (localName) { switch (localName) {
case COLLECTION: { case COLLECTION -> {
beginCollection(); beginCollection();
break;
} }
case RECORD: { case RECORD -> {
setFormat(thisformat); setFormat(thisformat);
setType(thistype); setType(thistype);
beginRecord(thisformat, thistype); beginRecord(thisformat, thistype);
break;
} }
case LEADER: { case LEADER -> {
break;
} }
case CONTROLFIELD: case CONTROLFIELD, DATAFIELD -> {
case DATAFIELD: {
MarcField.Builder builder = MarcField.builder().tag(tag); MarcField.Builder builder = MarcField.builder().tag(tag);
if (max > 0) { if (max > 0) {
builder.indicator(sb.substring(min - 1, max)); builder.indicator(sb.substring(min - 1, max));
} }
stack.push(builder); stack.push(builder);
break;
} }
case SUBFIELD: { case SUBFIELD -> {
stack.peek().subfield(code, null); stack.peek().subfield(code, null);
break;
} }
default: default -> {
break; }
} }
} else if (event.isEndElement()) { } else if (event.isEndElement()) {
EndElement element = (EndElement) event; EndElement element = (EndElement) event;
String uri = element.getName().getNamespaceURI(); String uri = element.getName().getNamespaceURI();
if (!isNamespace(uri)) { if (!validNamespaces.contains(uri)) {
return; return;
} }
String localName = element.getName().getLocalPart(); String localName = element.getName().getLocalPart();
switch (localName) { switch (localName) {
case COLLECTION: { case COLLECTION -> {
endCollection(); endCollection();
break;
} }
case RECORD: { case RECORD -> {
endRecord(); endRecord();
break;
} }
case LEADER: { case LEADER -> {
leader(RecordLabel.builder().from(content.toString().toCharArray()).build()); leader(RecordLabel.builder().from(content.toString().toCharArray()).build());
break;
} }
case CONTROLFIELD: { case CONTROLFIELD -> {
field(transformValue(stack.pop().value(content.toString()).build())); field(transformValue(stack.pop().value(content.toString()).build()));
break;
} }
case DATAFIELD: { case DATAFIELD -> {
field(transformValue(stack.pop().build())); field(transformValue(stack.pop().build()));
break;
} }
case SUBFIELD: { case SUBFIELD -> {
stack.peek().subfieldValue(content.toString()); stack.peek().subfieldValue(content.toString());
break;
} }
default: default -> {
break; }
} }
content.setLength(0); content.setLength(0);
} else if (event.isCharacters()) { } else if (event.isCharacters()) {
@ -285,8 +277,12 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo
return this; return this;
} }
private boolean isNamespace(String uri) { public boolean isEndRecordReached() {
return validNamespaces.contains(uri); return endRecordReached;
}
public void resetEndRecordReached() {
endRecordReached = false;
} }
private MarcField transformValue(MarcField field) { private MarcField transformValue(MarcField field) {

View file

@ -16,13 +16,19 @@
package org.xbib.marc.xml; package org.xbib.marc.xml;
import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertNull;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.xbib.marc.Marc; import org.xbib.marc.Marc;
import org.xbib.marc.MarcRecord;
import org.xmlunit.matchers.CompareMatcher; import org.xmlunit.matchers.CompareMatcher;
import java.io.InputStream; import java.io.InputStream;
import java.io.StringWriter; import java.io.StringWriter;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLInputFactory;
/** /**
@ -30,6 +36,8 @@ import javax.xml.stream.XMLInputFactory;
*/ */
public class MarcEventConsumerTest { public class MarcEventConsumerTest {
private static final Logger logger = Logger.getLogger(MarcEventConsumerTest.class.getName());
/** /**
* Parsing XML by STAX (streaming XML) from Aleph publishing interface (hbz dialect). * Parsing XML by STAX (streaming XML) from Aleph publishing interface (hbz dialect).
* *
@ -58,7 +66,7 @@ public class MarcEventConsumerTest {
.setFormat("AlephXML") .setFormat("AlephXML")
.setType("Bibliographic") .setType("Bibliographic")
.build() .build()
.parseEvents(XMLInputFactory.newFactory(), consumer); .parse(XMLInputFactory.newFactory(), consumer);
writer.endCollection(); writer.endCollection();
writer.endDocument(); writer.endDocument();
sw.close(); sw.close();
@ -72,7 +80,7 @@ public class MarcEventConsumerTest {
InputStream in = getClass().getResourceAsStream(s); InputStream in = getClass().getResourceAsStream(s);
MarcXchangeEventConsumer consumer = new MarcXchangeEventConsumer(); MarcXchangeEventConsumer consumer = new MarcXchangeEventConsumer();
consumer.addNamespace("http://www.ddb.de/professionell/mabxml/mabxml-1.xsd"); consumer.addNamespace("http://www.ddb.de/professionell/mabxml/mabxml-1.xsd");
MarcXchangeWriter writer = new MarcXchangeWriter(consumer); try (MarcXchangeWriter writer = new MarcXchangeWriter(consumer)) {
writer.setFormat("AlephXML").setType("Bibliographic"); writer.setFormat("AlephXML").setType("Bibliographic");
writer.startDocument(); writer.startDocument();
Marc.builder() Marc.builder()
@ -85,4 +93,38 @@ public class MarcEventConsumerTest {
writer.endDocument(); writer.endDocument();
assertNull(writer.getException()); assertNull(writer.getException());
} }
}
/**
 * Iterate over the records of the {@code chabon.mrc.xml} test resource with an
 * enhanced {@code for} statement and check the number of records delivered.
 *
 * @throws Exception if closing the input stream fails
 */
@Test
public void testXmlIterable() throws Exception {
    String s = "chabon.mrc.xml";
    int count = 0;
    // try-with-resources makes sure the resource stream is closed, even if iteration fails
    try (InputStream in = getClass().getResourceAsStream(s)) {
        for (MarcRecord marcRecord : Marc.builder()
                .setInputStream(in)
                .setCharset(StandardCharsets.UTF_8)
                .xmlIterable()) {
            logger.log(Level.INFO, marcRecord.toString());
            count++;
        }
    }
    assertEquals(2, count);
}
/**
 * Iterate over the records of the {@code HT016424175.xml} test resource with an explicit
 * iterator and a consumer that accepts the additional MAB XML namespace,
 * and check the number of records delivered.
 *
 * @throws Exception if closing the input stream fails
 */
@Test
public void testXmlIterator() throws Exception {
    String s = "HT016424175.xml";
    int count = 0;
    // try-with-resources makes sure the resource stream is closed, even if iteration fails
    try (InputStream in = getClass().getResourceAsStream(s)) {
        MarcXchangeEventConsumer consumer = new MarcXchangeEventConsumer();
        consumer.addNamespace("http://www.ddb.de/professionell/mabxml/mabxml-1.xsd");
        Iterator<MarcRecord> iterator = Marc.builder()
                .setInputStream(in)
                .setCharset(StandardCharsets.UTF_8)
                .xmlRecordIterator(consumer);
        while (iterator.hasNext()) {
            logger.log(Level.INFO, iterator.next().toString());
            count++;
        }
    }
    assertEquals(1, count);
}
} }