From ccdb458698cc504d8bac85875f5d2d0c75ab0e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Prante?= Date: Mon, 23 Jan 2023 17:51:31 +0100 Subject: [PATCH] fix allowed subfield IDs, add SRU hack for number of records, add MarcRecordIterator class --- src/main/java/org/xbib/marc/Marc.java | 11 +- .../org/xbib/marc/MarcRecordIterator.java | 8 + .../xbib/marc/StrictMarcFieldValidator.java | 15 +- .../marc/xml/MarcXchangeEventConsumer.java | 22 +- .../java/org/xbib/marc/MarcFieldTest.java | 43 +- .../xbib/marc/xml/MarcEventConsumerTest.java | 20 + src/test/resources/org/xbib/marc/xml/lvi.xml | 567 ++++++++++++++++++ 7 files changed, 662 insertions(+), 24 deletions(-) create mode 100644 src/main/java/org/xbib/marc/MarcRecordIterator.java create mode 100644 src/test/resources/org/xbib/marc/xml/lvi.xml diff --git a/src/main/java/org/xbib/marc/Marc.java b/src/main/java/org/xbib/marc/Marc.java index bede9b0..1db9d0d 100644 --- a/src/main/java/org/xbib/marc/Marc.java +++ b/src/main/java/org/xbib/marc/Marc.java @@ -1230,11 +1230,11 @@ public final class Marc { return this; } - public Iterator xmlRecordIterator() { + public MarcRecordIterator xmlRecordIterator() { return xmlRecordIterator(new MarcXchangeEventConsumer()); } - public Iterator xmlRecordIterator(MarcXchangeEventConsumer consumer) { + public MarcRecordIterator xmlRecordIterator(MarcXchangeEventConsumer consumer) { XMLEventReader xmlEventReader; try { xmlEventReader = XMLInputFactory.newFactory().createXMLEventReader(inputStream); @@ -1256,7 +1256,12 @@ public final class Marc { } }, Comparator.naturalOrder()); consumer.setMarcListener(marcRecordAdapter); - return new Iterator<>() { + return new MarcRecordIterator() { + @Override + public long getTotalNumberOfRecords() { + return consumer.getNumberOfRecords(); + } + @Override public boolean hasNext() { try { diff --git a/src/main/java/org/xbib/marc/MarcRecordIterator.java b/src/main/java/org/xbib/marc/MarcRecordIterator.java new file mode 100644 index 0000000..484d5d5 --- /dev/null +++ b/src/main/java/org/xbib/marc/MarcRecordIterator.java @@ -0,0 +1,8 @@ +package org.xbib.marc; + +import java.util.Iterator; + +public interface MarcRecordIterator extends Iterator { + + long getTotalNumberOfRecords(); +} diff --git a/src/main/java/org/xbib/marc/StrictMarcFieldValidator.java b/src/main/java/org/xbib/marc/StrictMarcFieldValidator.java index fd8ff9d..08a35a0 100644 --- a/src/main/java/org/xbib/marc/StrictMarcFieldValidator.java +++ b/src/main/java/org/xbib/marc/StrictMarcFieldValidator.java @@ -19,6 +19,9 @@ import java.util.Set; public class StrictMarcFieldValidator implements MarcFieldValidator { + /** + * See MARC variables in fields. + */ private static final Set ASCII_GRAPHICS = Set.of( '\u0020', '\u0021', '\u0022', '\u0023', '\u0024', '\u0025', '\u0026', '\'', '\u0028', '\u0029', '\u002A', '\u002B', '\u002C', '\u002D', '\u002E', '\u002F', @@ -34,8 +37,6 @@ public class StrictMarcFieldValidator implements MarcFieldValidator { '\u007B', '\u007C', '\u007D', '\u007E' ); - private static final char BLANK = ' '; - private static final String BLANK_STRING = " "; private static final String BLANK_TAG = " "; @@ -82,11 +83,13 @@ public class StrictMarcFieldValidator implements MarcFieldValidator { // we do not allow an empty subfield id. Elasticsearch field names require a length > 0. if (id.isEmpty()) { id = BLANK_STRING; - } else { - // We have inconsistent use of subfield id symbols as placeholders for a "blank space" - // and we need to fix it here for consistency. - id = id.replaceAll("[-#.^_]", BLANK_STRING); } + // we do not allow characters that are not in the graphics definition + if (id.length() == 1 && !ASCII_GRAPHICS.contains(id.charAt(0))) { + id = BLANK_STRING; + } + // sorry, but we must disallow . because of Elasticsearch. + id = id.replaceAll("\\.", BLANK_STRING); } return id; } diff --git a/src/main/java/org/xbib/marc/xml/MarcXchangeEventConsumer.java b/src/main/java/org/xbib/marc/xml/MarcXchangeEventConsumer.java index 91db10b..4daadfc 100644 --- a/src/main/java/org/xbib/marc/xml/MarcXchangeEventConsumer.java +++ b/src/main/java/org/xbib/marc/xml/MarcXchangeEventConsumer.java @@ -61,6 +61,8 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo private boolean endRecordReached; + private long numberOfRecords; + public MarcXchangeEventConsumer() { this.stack = new LinkedList<>(); this.marcListeners = new HashMap<>(); @@ -70,6 +72,7 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo this.validNamespaces = new HashSet<>(); this.validNamespaces.addAll(Set.of(MARCXCHANGE_V1_NS_URI, MARCXCHANGE_V2_NS_URI, MARC21_SCHEMA_URI)); this.endRecordReached = false; + this.numberOfRecords = -1L; } public MarcXchangeEventConsumer setMarcListener(String type, MarcListener listener) { @@ -143,6 +146,7 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo @Override public void add(XMLEvent event) throws XMLStreamException { if (event.isStartElement()) { + content.setLength(0); StartElement element = (StartElement) event; String uri = element.getName().getNamespaceURI(); if (!validNamespaces.contains(uri)) { @@ -193,7 +197,6 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo if (thistype == null) { thistype = this.type; } - content.setLength(0); switch (localName) { case COLLECTION -> { beginCollection(); @@ -220,11 +223,18 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo } } else if (event.isEndElement()) { EndElement element = (EndElement) event; - String uri = element.getName().getNamespaceURI(); - if (!validNamespaces.contains(uri)) { + // hack for SRU numberOfRecords + String localName = element.getName().getLocalPart(); + if ("numberOfRecords".equals(localName)) { + try { + this.numberOfRecords = Long.parseLong(content.toString()); + } catch (NumberFormatException e) { + // ignore + } + } + if (!validNamespaces.contains(element.getName().getNamespaceURI())) { return; } - String localName = element.getName().getLocalPart(); switch (localName) { case COLLECTION -> { endCollection(); @@ -285,6 +295,10 @@ public class MarcXchangeEventConsumer implements XMLEventConsumer, MarcXchangeCo endRecordReached = false; } + public long getNumberOfRecords() { + return numberOfRecords; + } + private MarcField transformValue(MarcField field) { return marcValueTransformers != null ? marcValueTransformers.transformValue(field) : field; } diff --git a/src/test/java/org/xbib/marc/MarcFieldTest.java b/src/test/java/org/xbib/marc/MarcFieldTest.java index 3e13477..721680d 100644 --- a/src/test/java/org/xbib/marc/MarcFieldTest.java +++ b/src/test/java/org/xbib/marc/MarcFieldTest.java @@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import org.junit.jupiter.api.Test; +import java.util.Set; import java.util.TreeSet; public class MarcFieldTest { @@ -182,16 +183,6 @@ public class MarcFieldTest { assertFalse(marcField.isIndicatorValid()); } - @Test - public void testInvalidSubfield() { - MarcField marcField = MarcField.builder() - .tag("100") - .indicator("0") - .subfield("\u007f", null) - .build(); - assertFalse(marcField.areAllSubfieldsValid()); - } - // 901 =, 901 a=98502599, 901 d=0, 901 e=14, 901 =f, 901 =h] @Test public void testBeginEndFields() { @@ -261,10 +252,40 @@ public class MarcFieldTest { assertTrue(marcField.isControl()); assertEquals("001", marcField.getTag()); assertEquals(" ", marcField.getIndicator()); - assertEquals("123", marcField.getFirstSubfieldValue(" ")); + assertEquals("123", marcField.getFirstSubfieldValue("_")); // _ is allowed! marcField = MarcField.builder().key("001", "\\.", "123").build(); assertTrue(marcField.isControl()); assertEquals("001", marcField.getTag()); assertEquals("123", marcField.getValue()); } + + @Test + public void testMarcSubfieldIds() { + for (Character ch : ASCII_GRAPHICS) { + if (ch == '.') { + ch = ' '; // special rule because of Elasticsearch + } + MarcField marcField = MarcField.builder() + .tag("100") + .indicator(" ") + .subfield(Character.toString(ch), "Hello World " + ch) + .build(); + assertEquals("Hello World " + ch, marcField.getSubfield(Character.toString(ch)).getFirst().getValue()); + } + } + + private static final Set ASCII_GRAPHICS = Set.of( + '\u0020', '\u0021', '\u0022', '\u0023', '\u0024', '\u0025', '\u0026', '\'', + '\u0028', '\u0029', '\u002A', '\u002B', '\u002C', '\u002D', '\u002E', '\u002F', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + '\u003A', '\u003B', '\u003C', '\u003D', '\u003E', '\u003F', '\u0040', + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', + 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', + 'U', 'V', 'W', 'X', 'Y', 'Z', + '\u005B', '\\', '\u005D', '\u005E', '\u005F', '\u0060', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', + 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', + 'u', 'v', 'w', 'x', 'y', 'z', + '\u007B', '\u007C', '\u007D', '\u007E' + ); } diff --git a/src/test/java/org/xbib/marc/xml/MarcEventConsumerTest.java b/src/test/java/org/xbib/marc/xml/MarcEventConsumerTest.java index e0df370..8114b1e 100644 --- a/src/test/java/org/xbib/marc/xml/MarcEventConsumerTest.java +++ b/src/test/java/org/xbib/marc/xml/MarcEventConsumerTest.java @@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import org.junit.jupiter.api.Test; import org.xbib.marc.Marc; import org.xbib.marc.MarcRecord; +import org.xbib.marc.MarcRecordIterator; import org.xmlunit.matchers.CompareMatcher; import java.io.InputStream; import java.io.StringWriter; @@ -127,4 +128,23 @@ public class MarcEventConsumerTest { } assertEquals(1, count.get()); } + + @Test + public void testSRUXMLIterable() { + String s = "lvi.xml"; + InputStream in = getClass().getResourceAsStream(s); + AtomicInteger count = new AtomicInteger(); + MarcXchangeEventConsumer consumer = new MarcXchangeEventConsumer(); + MarcRecordIterator iterator = Marc.builder() + .setInputStream(in) + .setCharset(StandardCharsets.UTF_8) + .xmlRecordIterator(consumer); + while (iterator.hasNext()) { + MarcRecord marcRecord = iterator.next(); + logger.log(Level.INFO, marcRecord.toString()); + count.incrementAndGet(); + } + assertEquals(5, count.get()); + assertEquals(5L, iterator.getTotalNumberOfRecords()); + } } diff --git a/src/test/resources/org/xbib/marc/xml/lvi.xml b/src/test/resources/org/xbib/marc/xml/lvi.xml new file mode 100644 index 0000000..088dee3 --- /dev/null +++ b/src/test/resources/org/xbib/marc/xml/lvi.xml @@ -0,0 +1,567 @@ + +1.15xml + 00000nam a2200373 cb4500 + HT014020783 + DE-605 + 00000000000000.0 + t + 040517s1982 gw ad|| |||| 00||| ger d + + 3506374419 + 3-506-37441-9 + + + Best.-Nr. 37441 + + + (OCoLC)74557806 + + + (DE-599)HBZHT014020783 + + + DE-605 + ger + rakwb + + + ger + + + gw + DE + + + Informatik + Manfred Prante ; Wolfgang Tofahrn + + + Schulbuch Schöningh + + + [Nachdr.] + + + Paderborn + Schöningh + [19]82 + + + 240 S. + Ill., graph. Darst. + + + (DE-588)4053369-4 + Schüler + gnd + + + Schöninghbuch + 37441 + + + Informatik + (DE-588)4026894-9 + gnd + + + (DE-588)4053458-3 + Schulbuch + gnd-content + + + DE-605 + + + Informatik + (DE-588)4026894-9 + s + + + Prante, Manfred + Sonstige + oth + + + Tofahrn, Wolfgang + Sonstige + oth + + + Schöninghbuch + 37441 + (DE-605)HT003053914 + + + 466 + DE-466 + NRW + c + keine Angabe + TSD4165 + + + 5 + DE-5 + NRW + d + keine ILL + PRA + DE-5-14 + 00070015 + + + 466 + DE-466 + NRW + 30 + 40 + TSD4165 + + + (DE-605)HBZ01-012633043 + + 1xml + 00000nam a2200541 c 4500 + BT000072798 + DE-605 + 19960816 + t + 960816s1987 ab|| |||| 00||| und d + + 3883395900 + 3-88339-590-0 + + + (OCoLC)1069846308 + + + (DE-599)HBZBT000072798 + + + DE-605 + ger + rakwb + + + und + + + Grafschaft Mark + nwbib + + + Bergbaugeschichte + nwbib + + + Bader, Karl Heinz + Verfasser + aut + + + ˜250œ [Zweihundertfünfzig] Jahre märkischer Steinkohlenbergbau + e. Beitr. zur Geschichte d. Bergbaus, d. Bergverwaltung u. d. Stadt Bochum + unter Mitarb. von Manfred Prante. + + + 250 Jahre märkischer Steinkohlenbergbau + + + Bochum + Studienverl. Brockmeyer + 1987 + + + 232 S. + Ill., Kt. + + + Bochum, Bergbaugeschichte + gnd + + + Mark (Grafschaft), Bergbaugeschichte + gnd + + + Steinkohlenbergbau, Bochum + gnd + + + Bergbaugeschichte, Bochum + gnd + + + Steinkohlenbergbau, Mark (Grafschaft) + gnd + + + Bochum, Steinkohlenbergbau + gnd + + + Mark (Grafschaft), Steinkohlenbergbau + gnd + + + DE-605 + + + Mark (Grafschaft), Bergbaugeschichte + + + + DE-605 + + + Bochum, Bergbaugeschichte + + + + DE-605 + + + Mark (Grafschaft), Steinkohlenbergbau + + + + DE-605 + + + Bochum, Steinkohlenbergbau + + + + DE-605 + + + Bergbaugeschichte, Bochum + + + + DE-605 + + + Steinkohlenbergbau, Mark (Grafschaft) + + + + DE-605 + + + Steinkohlenbergbau, Bochum + + + + Röttger, Karl + Verfasser + aut + + + Prante, Manfred + Sonstige + oth + + + Sol 1 + DE-Sol1 + NRW + d + keine ILL + MA 3072 + + + (DE-605)HBZ01-011119359 + + + NWBib + + 2xml + 00000nam a2200481 cb4500 + HT000319070 + DE-605 + 20100302 + t + 990714s1978 gw ad|| |||| 00||| ger d + + 3506374419 + 3-506-37441-9 + + + (OCoLC)256357573 + + + (DE-599)HBZHT000319070 + + + DE-605 + ger + rakwb + + + ger + + + gw + DE + + + 120 + rpb + + + 107 + rpb + + + 820 + rpb + + + Informatik + Manfred Prante ; Wolfgang Tofahrn + + + Schulbuch Schöningh + + + 1. Dr. + + + Paderborn + Schöningh + 1978 + + + 240 S. + Ill., graph. Darst. + + + (DE-588)4053369-4 + Schüler + gnd + + + Schöninghbuch + 37441 + + + Informatik + (DE-588)4026894-9 + gnd + + + (DE-588)4053458-3 + Schulbuch + gnd-content + + + DE-605 + + + Informatik + (DE-588)4026894-9 + s + + + Prante, Manfred + Sonstige + oth + + + Tofahrn, Wolfgang + Sonstige + oth + + + Schöninghbuch + 37441 + (DE-605)HT003053914 + + + 121 + DE-121 + NRW + c + 10 + 80 A 1963 + + + 6 + DE-6 + NRW + c + keine Angabe + 2D 617 + + + 708 + DE-708 + NRW + c + Keine Angabe + TQF/PRA + + + 82 + DE-82 + NRW + c + keine Angabe + BF7065 + + + 361 + DE-361 + NRW + c + ILL Kopie+Ausl. + HI100 P899 + 10I_M + + + 465 + DE-465 + NRW + b + Präsenzbestand + QKS1721_d + D94 + + + 465 + DE-465 + NRW + c + Ausleihbestand + QKS1721_d + D95 + + + 929 + DE-929 + NRW + c + ILL Ausleihe + 2010/931 + 00000000 + + + 38 + DE-38 + NRW + d + keine ILL + 315/4H85 + DE-38-315 + 00315001 + + + 361 + DE-361 + NRW + c + item_08 + HI100 P899 + 10I_Mono + + + 708 + DE-708 + NRW + c + 10 + 00 + TQF/PRA + + + 82 + DE-82 + NRW + Standard + BF7065 + MAGAZIN_4 + + + (DE-605)HBZ01-005825501 + + 3xml + 00000nam a2200000uu 4500 + (DE-E15)000003552 + DE-E15 + 20220523000000.0 + 220523s1987 gw |||| |||| 00||| ger|c + + DE-E15 + ger + DE-605 + + + Bader, Karl-Heinz + + + 250 Jahre märkischer Steinkohlenbergbau + Ein Beitrag zur Geschichte des Bergbaus, der Bergverwaltung und der Stadt Bochum + Karl-Heinz Bader ; Karl Röttger. Unter Mitarbeit von Manfred Prante + + + Bochum + Brockmeyer + 1987 + + + 232 S. + Ill., graph. Darst. + + + Prante, Manfred [Mitarb.] + + + Röttger, Karl + + + DE-E15 + NRW + + 4xml + 00000nam a2200000uu 4500 + (DE-131)0358674 + DE-131 + 20220519000000.0 + 220519s1978 gw |||| |||| 00||| ger|c + + 3506374419 + + + DE-131 + ger + DE-605 + + + Prante, Manfred + + + Informatik + Manfred Prante ; Wolfgang Tofahrn + + + Paderborn + Schöningh + 1978 + + + 240 S. + zahlr. graph. Darst. + + + Auf d. Rücken: Prante-Tofahrn. + + + Tofahrn, Wolfgang + + + DE-131 + NRW + + 5 \ No newline at end of file