fixing JSON escape, add Elasticsearch result example
This commit is contained in:
parent
957209c99e
commit
e34eedc86d
5 changed files with 80 additions and 38 deletions
|
@ -1,3 +1,7 @@
|
||||||
|
// Use attributes to shorten URLs
|
||||||
|
:repo: https://github.com/xbib/marc
|
||||||
|
:img: {repo}/raw/master/src.jbake/assets/images
|
||||||
|
|
||||||
# xbib MARC
|
# xbib MARC
|
||||||
|
|
||||||
## Bibliographic data processing library for Java
|
## Bibliographic data processing library for Java
|
||||||
|
@ -127,6 +131,10 @@ The result is a very basic MARC field based index, which is cumbersome to config
|
||||||
In upcoming projects, I will show how to turn MARC into semantic data with context,
|
In upcoming projects, I will show how to turn MARC into semantic data with context,
|
||||||
and indexing such data makes much more sense and is also more fun.
|
and indexing such data makes much more sense and is also more fun.
|
||||||
|
|
||||||
|
By executing `curl localhost:9200/_search?pretty` you can examine the result.
|
||||||
|
|
||||||
|
image:{img}/marcxchange-in-elasticsearch.png[]
|
||||||
|
|
||||||
## Bibliographic character sets
|
## Bibliographic character sets
|
||||||
|
|
||||||
Bibliographic character sets predate the era of Unicode. Before Unicode, character sets were
|
Bibliographic character sets predate the era of Unicode. Before Unicode, character sets were
|
||||||
|
|
BIN
src/jbake/assets/images/marcxchange-in-elasticsearch.png
Normal file
BIN
src/jbake/assets/images/marcxchange-in-elasticsearch.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 159 KiB |
|
@ -55,16 +55,16 @@ import java.util.zip.GZIPOutputStream;
|
||||||
*/
|
*/
|
||||||
public class MarcJsonWriter extends MarcContentHandler implements Flushable, Closeable {
|
public class MarcJsonWriter extends MarcContentHandler implements Flushable, Closeable {
|
||||||
|
|
||||||
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
|
|
||||||
|
|
||||||
private static final int DEFAULT_BUFFER_SIZE = 65536;
|
|
||||||
|
|
||||||
public static final String LEADER_TAG = "_LEADER";
|
public static final String LEADER_TAG = "_LEADER";
|
||||||
|
|
||||||
public static final String FORMAT_TAG = "_FORMAT";
|
public static final String FORMAT_TAG = "_FORMAT";
|
||||||
|
|
||||||
public static final String TYPE_TAG = "_TYPE";
|
public static final String TYPE_TAG = "_TYPE";
|
||||||
|
|
||||||
|
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
|
||||||
|
|
||||||
|
private static final int DEFAULT_BUFFER_SIZE = 65536;
|
||||||
|
|
||||||
private final Lock lock;
|
private final Lock lock;
|
||||||
|
|
||||||
private final StringBuilder sb;
|
private final StringBuilder sb;
|
||||||
|
@ -451,17 +451,15 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
* Split records, if configured.
|
* Split records, if configured.
|
||||||
*/
|
*/
|
||||||
private void afterRecord() {
|
private void afterRecord() {
|
||||||
if (fileNamePattern != null) {
|
if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) {
|
||||||
if (getRecordCounter() % splitlimit == 0) {
|
try {
|
||||||
try {
|
endCollection();
|
||||||
endCollection();
|
close();
|
||||||
close();
|
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
|
||||||
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
|
top = true;
|
||||||
top = true;
|
beginCollection();
|
||||||
beginCollection();
|
} catch (IOException e) {
|
||||||
} catch (IOException e) {
|
logger.log(Level.SEVERE, e.getMessage(), e);
|
||||||
logger.log(Level.SEVERE, e.getMessage(), e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -476,12 +474,17 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
||||||
new BufferedOutputStream(out, bufferSize), StandardCharsets.UTF_8);
|
new BufferedOutputStream(out, bufferSize), StandardCharsets.UTF_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern p = Pattern.compile("\"", Pattern.LITERAL);
|
private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL);
|
||||||
|
|
||||||
private static final String replacement = "\\\"";
|
private static final String escapeQuote = "\\\"";
|
||||||
|
|
||||||
|
private static final Pattern backslashPattern = Pattern.compile("\\\\");
|
||||||
|
|
||||||
|
private static final String escapeBackslash = "\\\\";
|
||||||
|
|
||||||
private static String escape(String value) {
|
private static String escape(String value) {
|
||||||
return p.matcher(value).replaceAll(Matcher.quoteReplacement(replacement));
|
String s = backslashPattern.matcher(value).replaceAll(Matcher.quoteReplacement(escapeBackslash));
|
||||||
|
return quotePattern.matcher(s).replaceAll(Matcher.quoteReplacement(escapeQuote));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeMetaDataLine(MarcRecord marcRecord) {
|
private void writeMetaDataLine(MarcRecord marcRecord) {
|
||||||
|
|
|
@ -485,17 +485,15 @@ public class MarcXchangeWriter extends MarcContentHandler implements Flushable,
|
||||||
* Split records if configured.
|
* Split records if configured.
|
||||||
*/
|
*/
|
||||||
private void afterRecord() {
|
private void afterRecord() {
|
||||||
if (fileNamePattern != null) {
|
if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) {
|
||||||
if (getRecordCounter() % splitlimit == 0) {
|
try {
|
||||||
try {
|
endCollection();
|
||||||
endCollection();
|
writer.close();
|
||||||
writer.close();
|
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
|
||||||
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
|
setupEventConsumer(writer, indent);
|
||||||
setupEventConsumer(writer, indent);
|
beginCollection();
|
||||||
beginCollection();
|
} catch (IOException e) {
|
||||||
} catch (IOException e) {
|
logger.log(Level.SEVERE, e.getMessage(), e);
|
||||||
logger.log(Level.SEVERE, e.getMessage(), e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,12 +35,37 @@ import java.io.FileOutputStream;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class MarcJsonWriterTest {
|
public class MarcJsonWriterTest {
|
||||||
|
|
||||||
|
private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL);
|
||||||
|
|
||||||
|
private static final String escapeQuote = "\\\"";
|
||||||
|
|
||||||
|
private static final Pattern backslashPattern = Pattern.compile("\\\\");
|
||||||
|
|
||||||
|
private static final String escapeBackslash = "\\\\";
|
||||||
|
|
||||||
|
private static String escape(String value) {
|
||||||
|
String s = backslashPattern.matcher(value).replaceAll(Matcher.quoteReplacement(escapeBackslash));
|
||||||
|
return quotePattern.matcher(s).replaceAll(Matcher.quoteReplacement(escapeQuote));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEscapeJSON() {
|
||||||
|
String s = "\"Hello world\"";
|
||||||
|
String t = escape(s);
|
||||||
|
assertEquals("\\\"Hello world\\\"", t);
|
||||||
|
s = "\\P123";
|
||||||
|
t = escape(s);
|
||||||
|
assertEquals("\\\\P123", t);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@code MarcJsonWriter} can receive MARC fields.
|
* {@code MarcJsonWriter} can receive MARC fields.
|
||||||
*
|
*
|
||||||
|
@ -181,13 +206,17 @@ public class MarcJsonWriterTest {
|
||||||
assertNull(writer.getException());
|
assertNull(writer.getException());
|
||||||
}
|
}
|
||||||
File f0 = new File("build/0.json");
|
File f0 = new File("build/0.json");
|
||||||
assertTrue(f0.exists() && f0.length() == 6015);
|
assertTrue(f0.exists());
|
||||||
|
assertEquals(6015, f0.length());
|
||||||
File f1 = new File("build/1.json");
|
File f1 = new File("build/1.json");
|
||||||
assertTrue(f1.exists() && f1.length() == 7127);
|
assertTrue(f1.exists());
|
||||||
|
assertEquals(7130, f1.length());
|
||||||
File f2 = new File("build/2.json");
|
File f2 = new File("build/2.json");
|
||||||
assertTrue(f2.exists() && f2.length() == 6426);
|
assertTrue(f2.exists());
|
||||||
|
assertEquals(6426, f2.length());
|
||||||
File f3 = new File("build/3.json");
|
File f3 = new File("build/3.json");
|
||||||
assertTrue(f3.exists() && f3.length() == 2110);
|
assertTrue(f3.exists());
|
||||||
|
assertEquals(2110, f3.length());
|
||||||
File f4 = new File("build/4.json");
|
File f4 = new File("build/4.json");
|
||||||
assertFalse(f4.exists());
|
assertFalse(f4.exists());
|
||||||
}
|
}
|
||||||
|
@ -213,13 +242,17 @@ public class MarcJsonWriterTest {
|
||||||
assertEquals(10, writer.getRecordCounter());
|
assertEquals(10, writer.getRecordCounter());
|
||||||
}
|
}
|
||||||
File f0 = new File("build/bulk0.jsonl");
|
File f0 = new File("build/bulk0.jsonl");
|
||||||
assertTrue(f0.exists() && f0.length() == 6295);
|
assertTrue(f0.exists());
|
||||||
|
assertEquals(6295, f0.length());
|
||||||
File f1 = new File("build/bulk1.jsonl");
|
File f1 = new File("build/bulk1.jsonl");
|
||||||
assertTrue(f1.exists() && f1.length() == 7407);
|
assertTrue(f1.exists());
|
||||||
|
assertEquals(7410, f1.length());
|
||||||
File f2 = new File("build/bulk2.jsonl");
|
File f2 = new File("build/bulk2.jsonl");
|
||||||
assertTrue(f2.exists() && f2.length() == 6706);
|
assertTrue(f2.exists());
|
||||||
|
assertEquals(6706, f2.length());
|
||||||
File f3 = new File("build/bulk3.jsonl");
|
File f3 = new File("build/bulk3.jsonl");
|
||||||
assertTrue(f3.exists() && f3.length() == 2204);
|
assertTrue(f3.exists());
|
||||||
|
assertEquals(2204, f3.length());
|
||||||
File f4 = new File("build/bulk4.jsonl");
|
File f4 = new File("build/bulk4.jsonl");
|
||||||
assertFalse(f4.exists());
|
assertFalse(f4.exists());
|
||||||
}
|
}
|
||||||
|
@ -250,7 +283,7 @@ public class MarcJsonWriterTest {
|
||||||
assertEquals(2141, f0.length());
|
assertEquals(2141, f0.length());
|
||||||
File f1 = new File("build/bulk1.jsonl.gz");
|
File f1 = new File("build/bulk1.jsonl.gz");
|
||||||
assertTrue(f1.exists());
|
assertTrue(f1.exists());
|
||||||
assertEquals(2605, f1.length());
|
assertEquals(2608, f1.length());
|
||||||
File f2 = new File("build/bulk2.jsonl.gz");
|
File f2 = new File("build/bulk2.jsonl.gz");
|
||||||
assertTrue(f2.exists());
|
assertTrue(f2.exists());
|
||||||
assertEquals(2667, f2.length());
|
assertEquals(2667, f2.length());
|
||||||
|
|
Loading…
Reference in a new issue