make replace faster in strict field validation, remove spurious replaces

This commit is contained in:
Jörg Prante 2022-11-04 19:58:30 +01:00
parent 9950003ea7
commit 6ae3134827
6 changed files with 67 additions and 101 deletions

View file

@ -47,15 +47,13 @@ public class StrictMarcFieldValidator implements MarcFieldValidator {
public String validateTag(String tagCandidate) {
String tag = tagCandidate;
if (tag != null) {
// We have inconsistent use of tag symbols as placeholders for a "blank space"
// and we need to fix it here for consistency.
tag = tag.replace('-', BLANK)
.replace('#', BLANK)
.replace('.', BLANK)
.replace('_', BLANK);
// do not allow empty tags
if (tag.isEmpty()) {
tag = BLANK_TAG;
} else {
// We have inconsistent use of tag symbols as placeholders for a "blank space"
// and we need to fix it here for consistency.
tag = tag.replaceAll("[-#.^_]", BLANK_STRING);
}
}
return tag;
@ -65,16 +63,13 @@ public class StrictMarcFieldValidator implements MarcFieldValidator {
public String validateIndicator(String indicatorCandidate) {
String indicator = indicatorCandidate;
if (indicator != null) {
// We have inconsistent use of indicator symbols as placeholders for a "blank space"
// and we need to fix it here for consistency.
indicator = indicator
.replace('-', BLANK)
.replace('#', BLANK)
.replace('.', BLANK)
.replace('_', BLANK);
// we do not allow an empty indicator. Elasticsearch field names require a length > 0.
if (indicator.isEmpty()) {
indicator = BLANK_STRING;
} else {
// We have inconsistent use of indicator symbols as placeholders for a "blank space"
// and we need to fix it here for consistency.
indicator = indicator.replaceAll("[-#.^_]", BLANK_STRING);
}
}
return indicator;
@ -84,15 +79,13 @@ public class StrictMarcFieldValidator implements MarcFieldValidator {
public String validateSubfieldId(String subfieldIdCandidate) {
String id = subfieldIdCandidate;
if (id != null) {
// We have inconsistent use of subfield id symbols as placeholders for a "blank space"
// and we need to fix it here for consistency.
id = id.replace('-', BLANK)
.replace('#', BLANK)
.replace('.', BLANK)
.replace('_', BLANK);
// we do not allow an empty subfield id. Elasticsearch field names require a length > 0.
if (id.isEmpty()) {
id = BLANK_STRING;
} else {
// We have inconsistent use of subfield id symbols as placeholders for a "blank space"
// and we need to fix it here for consistency.
id = id.replaceAll("[-#.^_]", BLANK_STRING);
}
}
return id;

View file

@ -73,7 +73,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
private boolean fatalErrors;
private EnumSet<Style> style;
private final EnumSet<Style> style;
private Exception exception;
@ -83,7 +83,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
private int splitlimit;
private int bufferSize;
private final int bufferSize;
private boolean compress;
@ -143,44 +143,6 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
}
/**
 * Escapes a string so it can be placed between the double quotes of a JSON string literal.
 *
 * <p>The double quote and the backslash are backslash-escaped, the control characters
 * backspace, form feed, line feed, carriage return and tab use their short two-character
 * escapes, and every other control character from U+0000 up to and including U+001F is
 * written as a lower-case {@code \}{@code uXXXX} sequence. All remaining characters are
 * copied unchanged.
 *
 * @param value the raw text to escape, must not be {@code null}
 * @return the escaped text, without surrounding quotes
 */
private static String escape(String value) {
    // The result is at least as long as the input, so start with that capacity.
    StringBuilder sb = new StringBuilder(value.length());
    for (int i = 0; i < value.length(); i++) {
        char c = value.charAt(i);
        switch (c) {
            case '"':
                sb.append("\\\"");
                break;
            case '\\':
                sb.append("\\\\");
                break;
            case '\b':
                sb.append("\\b");
                break;
            case '\f':
                sb.append("\\f");
                break;
            case '\n':
                sb.append("\\n");
                break;
            case '\r':
                sb.append("\\r");
                break;
            case '\t':
                sb.append("\\t");
                break;
            default:
                // JSON forbids raw control characters in strings; the range is inclusive
                // of U+001F, so the comparison must be <= and not <.
                if (c <= 0x1f) {
                    sb.append("\\u").append(String.format("%04x", (int) c));
                } else {
                    sb.append(c);
                }
                break;
        }
    }
    return sb.toString();
}
public MarcJsonWriter setIndex(String index, String indexType) {
this.index = index;
this.indexType = indexType;
@ -375,7 +337,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
jsonWriter.writeArrayClose();
} else {
jsonWriter.writeObjectOpen();
jsonWriter.writeMemberName(marcField.getIndicator().replace(' ', '_'));
jsonWriter.writeMemberName(marcField.getIndicator());
jsonWriter.writeMemberSeparator();
jsonWriter.writeArrayOpen();
boolean subfieldseparator = false;
@ -399,7 +361,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
}
/**
* Write MARC record from underlying map as key-oriented JSON.
* Write MARC record from underlying map as key-oriented JSON. Use repeat maps to create lists.
* @param marcRecord the MARC record
* @throws IOException if writing fails
*/
@ -583,15 +545,16 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
jsonWriter = new JsonWriter(writer);
}
@SuppressWarnings("unchecked")
private void writeMetaDataLine(MarcRecord marcRecord) {
String id;
Object object = marcRecord.get("001");
// step down to indicator/subfield ID levels if possible, get first value, assuming single field/value in 001
if (object instanceof Map) {
object = ((Map) object).values().iterator().next();
object = ((Map<String, Object>) object).values().iterator().next();
}
if (object instanceof Map) {
object = ((Map) object).values().iterator().next();
object = ((Map<String, Object>) object).values().iterator().next();
}
id = object.toString();
if (index != null && indexType != null && id != null) {
@ -614,6 +577,44 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
}
}
/**
 * Escapes a string so it can be placed between the double quotes of a JSON string literal.
 *
 * <p>The double quote and the backslash are backslash-escaped, the control characters
 * backspace, form feed, line feed, carriage return and tab use their short two-character
 * escapes, and every other control character from U+0000 up to and including U+001F is
 * written as a lower-case {@code \}{@code uXXXX} sequence. All remaining characters are
 * copied unchanged.
 *
 * @param value the raw text to escape, must not be {@code null}
 * @return the escaped text, without surrounding quotes
 */
private static String escape(String value) {
    // The result is at least as long as the input, so start with that capacity.
    StringBuilder sb = new StringBuilder(value.length());
    for (int i = 0; i < value.length(); i++) {
        char c = value.charAt(i);
        switch (c) {
            case '"':
                sb.append("\\\"");
                break;
            case '\\':
                sb.append("\\\\");
                break;
            case '\b':
                sb.append("\\b");
                break;
            case '\f':
                sb.append("\\f");
                break;
            case '\n':
                sb.append("\\n");
                break;
            case '\r':
                sb.append("\\r");
                break;
            case '\t':
                sb.append("\\t");
                break;
            default:
                // JSON forbids raw control characters in strings; the range is inclusive
                // of U+001F, so the comparison must be <= and not <.
                if (c <= 0x1f) {
                    sb.append("\\u").append(String.format("%04x", (int) c));
                } else {
                    sb.append(c);
                }
                break;
        }
    }
    return sb.toString();
}
/**
*
*/

View file

@ -133,19 +133,7 @@ public class InverseMarcContentHandler implements MarcListener, MarcXchangeConst
attrs.addAttribute(EMPTY_STRING, TAG_ATTRIBUTE, prefix(TAG_ATTRIBUTE), CDATA, field.getTag());
contentHandler.startElement(nsUri, CONTROLFIELD, prefix(CONTROLFIELD), attrs);
String value = field.getValue();
if (value != null && !value.isEmpty()) {
switch (field.getTag()) {
case "006":
case "007":
case "008":
// fix wrong fill characters here
value = value.replace('^', '|');
break;
default:
break;
}
contentHandler.characters(value.toCharArray(), 0, value.length());
}
contentHandler.endElement(nsUri, CONTROLFIELD, prefix(CONTROLFIELD));
} else {
String tag = field.getTag();

View file

@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
@ -91,33 +92,17 @@ public class StreamMatcher {
}
}
/**
 * Runs two producers in sequence over a class path resource and checks both outputs
 * against reference resources.
 *
 * <p>{@code producer1} reads the resource {@code resourceName} and writes to a first
 * temporary file; {@code producer2} reads that first temporary file and writes to a second
 * one. Each temporary file is then asserted to be identical to the reference resource
 * named {@code resourceName + suffix1} and {@code resourceName + suffix2} respectively.
 * Both temporary files are deleted afterwards, whether or not the checks passed.
 *
 * @param cl the class whose {@code getResource} is used to locate the input and the references
 * @param resourceName name of the input resource, also used as the temporary file prefix
 * @param suffix1 suffix of the first temporary file and of the first reference resource
 * @param producer1 producer converting the input resource into the first output
 * @param suffix2 suffix of the second temporary file and of the second reference resource
 * @param producer2 producer converting the first output into the second output
 * @throws IOException if creating, reading, writing or deleting a file fails
 */
public static void compareTwoXmls(Class<?> cl, String resourceName,
String suffix1, Producer producer1,
String suffix2, Producer producer2) throws IOException {
// Files.createTempFile takes (prefix, suffix): the resource name is only the file name prefix.
Path path1 = Files.createTempFile(resourceName, suffix1);
Path path2 = Files.createTempFile(resourceName, suffix2);
// NOTE(review): all four streams are opened up front, so inputStream2 is opened on path1
// before producer1 has written anything, and outputStream1 is still open while producer2
// reads. This presumably relies on producer1 flushing its output - confirm.
try (InputStream inputStream1 = cl.getResource(resourceName).openStream();
OutputStream outputStream1 = Files.newOutputStream(path1);
InputStream inputStream2 = Files.newInputStream(path1);
OutputStream outputStream2 = Files.newOutputStream(path2)) {
producer1.produce(inputStream1, outputStream1);
producer2.produce(inputStream2, outputStream2);
// NOTE(review): the reference streams opened below are not closed by this method;
// verify whether the matcher closes them.
assertThat("XML check of " + path1, path1, CompareMatcher.isIdenticalTo(cl.getResource(resourceName + suffix1).openStream()));
assertThat("XML check of " + path2, path2, CompareMatcher.isIdenticalTo(cl.getResource(resourceName + suffix2).openStream()));
} finally {
// Always remove the temporary files, even when a producer or an assertion failed.
Files.delete(path1);
Files.delete(path2);
}
}
public static void generate(Class<?> cl, String resourceName, String suffix, Producer producer) throws IOException {
Path path = Paths.get("src/test/resources", cl.getPackageName().replace('.', '/'), resourceName + suffix);
logger.log(Level.INFO, "path = " + path);
try (InputStream inputStream = cl.getResource(resourceName).openStream();
URL url = cl.getResource(resourceName);
if (url != null) {
try (InputStream inputStream = url.openStream();
OutputStream outputStream = Files.newOutputStream(path)) {
producer.produce(inputStream, outputStream);
}
}
}
public static void assertStream(String name, Path path1, Path path2) throws IOException {
assertStream(name, Files.newInputStream(path1), Files.newInputStream(path2));

View file

@ -352,8 +352,7 @@ public class MarcJsonWriterTest {
"test_ubl.mrc"
}) {
StreamMatcher.fileMatch(getClass(), s, ".json", (inputStream, outputStream) -> {
try (MarcJsonWriter writer = new MarcJsonWriter(outputStream, EnumSet.of(MarcJsonWriter.Style.ALLOW_DUPLICATES))
) {
try (MarcJsonWriter writer = new MarcJsonWriter(outputStream, EnumSet.of(MarcJsonWriter.Style.ALLOW_DUPLICATES))) {
Marc.builder()
.setFormat(MarcXchangeConstants.MARCXCHANGE_FORMAT)
.setType(MarcXchangeConstants.BIBLIOGRAPHIC_TYPE)

File diff suppressed because one or more lines are too long