make replace faster in strict field validation, remove spurious replaces
This commit is contained in:
parent
9950003ea7
commit
6ae3134827
6 changed files with 67 additions and 101 deletions
|
@ -47,15 +47,13 @@ public class StrictMarcFieldValidator implements MarcFieldValidator {
|
||||||
public String validateTag(String tagCandidate) {
|
public String validateTag(String tagCandidate) {
|
||||||
String tag = tagCandidate;
|
String tag = tagCandidate;
|
||||||
if (tag != null) {
|
if (tag != null) {
|
||||||
// We have inconsistent use of tag symbols as placeholders for a "blank space"
|
|
||||||
// and we need to fix it here for consistency.
|
|
||||||
tag = tag.replace('-', BLANK)
|
|
||||||
.replace('#', BLANK)
|
|
||||||
.replace('.', BLANK)
|
|
||||||
.replace('_', BLANK);
|
|
||||||
// do not allow empty tags
|
// do not allow empty tags
|
||||||
if (tag.isEmpty()) {
|
if (tag.isEmpty()) {
|
||||||
tag = BLANK_TAG;
|
tag = BLANK_TAG;
|
||||||
|
} else {
|
||||||
|
// We have inconsistent use of tag symbols as placeholders for a "blank space"
|
||||||
|
// and we need to fix it here for consistency.
|
||||||
|
tag = tag.replaceAll("[-#.^_]", BLANK_STRING);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return tag;
|
return tag;
|
||||||
|
@ -65,16 +63,13 @@ public class StrictMarcFieldValidator implements MarcFieldValidator {
|
||||||
public String validateIndicator(String indicatorCandidate) {
|
public String validateIndicator(String indicatorCandidate) {
|
||||||
String indicator = indicatorCandidate;
|
String indicator = indicatorCandidate;
|
||||||
if (indicator != null) {
|
if (indicator != null) {
|
||||||
// We have inconsistent use of indicator symbols as placeholders for a "blank space"
|
|
||||||
// and we need to fix it here for consistency.
|
|
||||||
indicator = indicator
|
|
||||||
.replace('-', BLANK)
|
|
||||||
.replace('#', BLANK)
|
|
||||||
.replace('.', BLANK)
|
|
||||||
.replace('_', BLANK);
|
|
||||||
// we do not allow an empty indicator. Elasticsearch field names require a length > 0.
|
// we do not allow an empty indicator. Elasticsearch field names require a length > 0.
|
||||||
if (indicator.isEmpty()) {
|
if (indicator.isEmpty()) {
|
||||||
indicator = BLANK_STRING;
|
indicator = BLANK_STRING;
|
||||||
|
} else {
|
||||||
|
// We have inconsistent use of indicator symbols as placeholders for a "blank space"
|
||||||
|
// and we need to fix it here for consistency.
|
||||||
|
indicator = indicator.replaceAll("[-#.^_]", BLANK_STRING);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return indicator;
|
return indicator;
|
||||||
|
@ -84,15 +79,13 @@ public class StrictMarcFieldValidator implements MarcFieldValidator {
|
||||||
public String validateSubfieldId(String subfieldIdCandidate) {
|
public String validateSubfieldId(String subfieldIdCandidate) {
|
||||||
String id = subfieldIdCandidate;
|
String id = subfieldIdCandidate;
|
||||||
if (id != null) {
|
if (id != null) {
|
||||||
// We have inconsistent use of subfield id symbols as placeholders for a "blank space"
|
|
||||||
// and we need to fix it here for consistency.
|
|
||||||
id = id.replace('-', BLANK)
|
|
||||||
.replace('#', BLANK)
|
|
||||||
.replace('.', BLANK)
|
|
||||||
.replace('_', BLANK);
|
|
||||||
// we do not allow an empty subfield id. Elasticsearch field names require a length > 0.
|
// we do not allow an empty subfield id. Elasticsearch field names require a length > 0.
|
||||||
if (id.isEmpty()) {
|
if (id.isEmpty()) {
|
||||||
id = BLANK_STRING;
|
id = BLANK_STRING;
|
||||||
|
} else {
|
||||||
|
// We have inconsistent use of subfield id symbols as placeholders for a "blank space"
|
||||||
|
// and we need to fix it here for consistency.
|
||||||
|
id = id.replaceAll("[-#.^_]", BLANK_STRING);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return id;
|
return id;
|
||||||
|
|
|
@ -73,7 +73,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
|
|
||||||
private boolean fatalErrors;
|
private boolean fatalErrors;
|
||||||
|
|
||||||
private EnumSet<Style> style;
|
private final EnumSet<Style> style;
|
||||||
|
|
||||||
private Exception exception;
|
private Exception exception;
|
||||||
|
|
||||||
|
@ -83,7 +83,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
|
|
||||||
private int splitlimit;
|
private int splitlimit;
|
||||||
|
|
||||||
private int bufferSize;
|
private final int bufferSize;
|
||||||
|
|
||||||
private boolean compress;
|
private boolean compress;
|
||||||
|
|
||||||
|
@ -143,44 +143,6 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
|
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String escape(String value) {
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
for (int i = 0; i < value.length(); i++) {
|
|
||||||
char c = value.charAt(i);
|
|
||||||
switch (c) {
|
|
||||||
case '"':
|
|
||||||
sb.append("\\\"");
|
|
||||||
break;
|
|
||||||
case '\\':
|
|
||||||
sb.append("\\\\");
|
|
||||||
break;
|
|
||||||
case '\b':
|
|
||||||
sb.append("\\b");
|
|
||||||
break;
|
|
||||||
case '\f':
|
|
||||||
sb.append("\\f");
|
|
||||||
break;
|
|
||||||
case '\n':
|
|
||||||
sb.append("\\n");
|
|
||||||
break;
|
|
||||||
case '\r':
|
|
||||||
sb.append("\\r");
|
|
||||||
break;
|
|
||||||
case '\t':
|
|
||||||
sb.append("\\t");
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
if (c < 0x1f) {
|
|
||||||
sb.append("\\u").append(String.format("%04x", (int) c));
|
|
||||||
} else {
|
|
||||||
sb.append(c);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public MarcJsonWriter setIndex(String index, String indexType) {
|
public MarcJsonWriter setIndex(String index, String indexType) {
|
||||||
this.index = index;
|
this.index = index;
|
||||||
this.indexType = indexType;
|
this.indexType = indexType;
|
||||||
|
@ -375,7 +337,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
jsonWriter.writeArrayClose();
|
jsonWriter.writeArrayClose();
|
||||||
} else {
|
} else {
|
||||||
jsonWriter.writeObjectOpen();
|
jsonWriter.writeObjectOpen();
|
||||||
jsonWriter.writeMemberName(marcField.getIndicator().replace(' ', '_'));
|
jsonWriter.writeMemberName(marcField.getIndicator());
|
||||||
jsonWriter.writeMemberSeparator();
|
jsonWriter.writeMemberSeparator();
|
||||||
jsonWriter.writeArrayOpen();
|
jsonWriter.writeArrayOpen();
|
||||||
boolean subfieldseparator = false;
|
boolean subfieldseparator = false;
|
||||||
|
@ -399,7 +361,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write MARC record from underlying map as key-oriented JSON.
|
* Write MARC record from underlying map as key-oriented JSON. Use repeat maps to create lists.
|
||||||
* @param marcRecord the MARC record
|
* @param marcRecord the MARC record
|
||||||
* @throws IOException if writing fails
|
* @throws IOException if writing fails
|
||||||
*/
|
*/
|
||||||
|
@ -583,15 +545,16 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
jsonWriter = new JsonWriter(writer);
|
jsonWriter = new JsonWriter(writer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
private void writeMetaDataLine(MarcRecord marcRecord) {
|
private void writeMetaDataLine(MarcRecord marcRecord) {
|
||||||
String id;
|
String id;
|
||||||
Object object = marcRecord.get("001");
|
Object object = marcRecord.get("001");
|
||||||
// step down to indicator/subfield ID levels if possible, get first value, assuming single field/value in 001
|
// step down to indicator/subfield ID levels if possible, get first value, assuming single field/value in 001
|
||||||
if (object instanceof Map) {
|
if (object instanceof Map) {
|
||||||
object = ((Map) object).values().iterator().next();
|
object = ((Map<String, Object>) object).values().iterator().next();
|
||||||
}
|
}
|
||||||
if (object instanceof Map) {
|
if (object instanceof Map) {
|
||||||
object = ((Map) object).values().iterator().next();
|
object = ((Map<String, Object>) object).values().iterator().next();
|
||||||
}
|
}
|
||||||
id = object.toString();
|
id = object.toString();
|
||||||
if (index != null && indexType != null && id != null) {
|
if (index != null && indexType != null && id != null) {
|
||||||
|
@ -614,6 +577,44 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String escape(String value) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (int i = 0; i < value.length(); i++) {
|
||||||
|
char c = value.charAt(i);
|
||||||
|
switch (c) {
|
||||||
|
case '"':
|
||||||
|
sb.append("\\\"");
|
||||||
|
break;
|
||||||
|
case '\\':
|
||||||
|
sb.append("\\\\");
|
||||||
|
break;
|
||||||
|
case '\b':
|
||||||
|
sb.append("\\b");
|
||||||
|
break;
|
||||||
|
case '\f':
|
||||||
|
sb.append("\\f");
|
||||||
|
break;
|
||||||
|
case '\n':
|
||||||
|
sb.append("\\n");
|
||||||
|
break;
|
||||||
|
case '\r':
|
||||||
|
sb.append("\\r");
|
||||||
|
break;
|
||||||
|
case '\t':
|
||||||
|
sb.append("\\t");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (c < 0x1f) {
|
||||||
|
sb.append("\\u").append(String.format("%04x", (int) c));
|
||||||
|
} else {
|
||||||
|
sb.append(c);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -133,19 +133,7 @@ public class InverseMarcContentHandler implements MarcListener, MarcXchangeConst
|
||||||
attrs.addAttribute(EMPTY_STRING, TAG_ATTRIBUTE, prefix(TAG_ATTRIBUTE), CDATA, field.getTag());
|
attrs.addAttribute(EMPTY_STRING, TAG_ATTRIBUTE, prefix(TAG_ATTRIBUTE), CDATA, field.getTag());
|
||||||
contentHandler.startElement(nsUri, CONTROLFIELD, prefix(CONTROLFIELD), attrs);
|
contentHandler.startElement(nsUri, CONTROLFIELD, prefix(CONTROLFIELD), attrs);
|
||||||
String value = field.getValue();
|
String value = field.getValue();
|
||||||
if (value != null && !value.isEmpty()) {
|
|
||||||
switch (field.getTag()) {
|
|
||||||
case "006":
|
|
||||||
case "007":
|
|
||||||
case "008":
|
|
||||||
// fix wrong fill characters here
|
|
||||||
value = value.replace('^', '|');
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
contentHandler.characters(value.toCharArray(), 0, value.length());
|
contentHandler.characters(value.toCharArray(), 0, value.length());
|
||||||
}
|
|
||||||
contentHandler.endElement(nsUri, CONTROLFIELD, prefix(CONTROLFIELD));
|
contentHandler.endElement(nsUri, CONTROLFIELD, prefix(CONTROLFIELD));
|
||||||
} else {
|
} else {
|
||||||
String tag = field.getTag();
|
String tag = field.getTag();
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
import java.net.URL;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.channels.Channels;
|
import java.nio.channels.Channels;
|
||||||
import java.nio.channels.ReadableByteChannel;
|
import java.nio.channels.ReadableByteChannel;
|
||||||
|
@ -91,33 +92,17 @@ public class StreamMatcher {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void compareTwoXmls(Class<?> cl, String resourceName,
|
|
||||||
String suffix1, Producer producer1,
|
|
||||||
String suffix2, Producer producer2) throws IOException {
|
|
||||||
Path path1 = Files.createTempFile(resourceName, suffix1);
|
|
||||||
Path path2 = Files.createTempFile(resourceName, suffix2);
|
|
||||||
try (InputStream inputStream1 = cl.getResource(resourceName).openStream();
|
|
||||||
OutputStream outputStream1 = Files.newOutputStream(path1);
|
|
||||||
InputStream inputStream2 = Files.newInputStream(path1);
|
|
||||||
OutputStream outputStream2 = Files.newOutputStream(path2)) {
|
|
||||||
producer1.produce(inputStream1, outputStream1);
|
|
||||||
producer2.produce(inputStream2, outputStream2);
|
|
||||||
assertThat("XML check of " + path1, path1, CompareMatcher.isIdenticalTo(cl.getResource(resourceName + suffix1).openStream()));
|
|
||||||
assertThat("XML check of " + path2, path2, CompareMatcher.isIdenticalTo(cl.getResource(resourceName + suffix2).openStream()));
|
|
||||||
} finally {
|
|
||||||
Files.delete(path1);
|
|
||||||
Files.delete(path2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void generate(Class<?> cl, String resourceName, String suffix, Producer producer) throws IOException {
|
public static void generate(Class<?> cl, String resourceName, String suffix, Producer producer) throws IOException {
|
||||||
Path path = Paths.get("src/test/resources", cl.getPackageName().replace('.', '/'), resourceName + suffix);
|
Path path = Paths.get("src/test/resources", cl.getPackageName().replace('.', '/'), resourceName + suffix);
|
||||||
logger.log(Level.INFO, "path = " + path);
|
logger.log(Level.INFO, "path = " + path);
|
||||||
try (InputStream inputStream = cl.getResource(resourceName).openStream();
|
URL url = cl.getResource(resourceName);
|
||||||
|
if (url != null) {
|
||||||
|
try (InputStream inputStream = url.openStream();
|
||||||
OutputStream outputStream = Files.newOutputStream(path)) {
|
OutputStream outputStream = Files.newOutputStream(path)) {
|
||||||
producer.produce(inputStream, outputStream);
|
producer.produce(inputStream, outputStream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static void assertStream(String name, Path path1, Path path2) throws IOException {
|
public static void assertStream(String name, Path path1, Path path2) throws IOException {
|
||||||
assertStream(name, Files.newInputStream(path1), Files.newInputStream(path2));
|
assertStream(name, Files.newInputStream(path1), Files.newInputStream(path2));
|
||||||
|
|
|
@ -352,8 +352,7 @@ public class MarcJsonWriterTest {
|
||||||
"test_ubl.mrc"
|
"test_ubl.mrc"
|
||||||
}) {
|
}) {
|
||||||
StreamMatcher.fileMatch(getClass(), s, ".json", (inputStream, outputStream) -> {
|
StreamMatcher.fileMatch(getClass(), s, ".json", (inputStream, outputStream) -> {
|
||||||
try (MarcJsonWriter writer = new MarcJsonWriter(outputStream, EnumSet.of(MarcJsonWriter.Style.ALLOW_DUPLICATES))
|
try (MarcJsonWriter writer = new MarcJsonWriter(outputStream, EnumSet.of(MarcJsonWriter.Style.ALLOW_DUPLICATES))) {
|
||||||
) {
|
|
||||||
Marc.builder()
|
Marc.builder()
|
||||||
.setFormat(MarcXchangeConstants.MARCXCHANGE_FORMAT)
|
.setFormat(MarcXchangeConstants.MARCXCHANGE_FORMAT)
|
||||||
.setType(MarcXchangeConstants.BIBLIOGRAPHIC_TYPE)
|
.setType(MarcXchangeConstants.BIBLIOGRAPHIC_TYPE)
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue