fixing JSON escape, add Elasticsearch result example

This commit is contained in:
Jörg Prante 2016-09-28 14:45:32 +02:00
parent 957209c99e
commit e34eedc86d
5 changed files with 80 additions and 38 deletions

View file

@ -1,3 +1,7 @@
// Use attribute to shorten urls
:repo: https://github.com/xbib/marc
:img: {repo}/raw/master/src.jbake/assets/images
# xbib MARC
## Bibliographic data processing library for Java
@ -127,6 +131,10 @@ The result is a very basic MARC field based index, which is cumbersome to config
In upcoming projects, I will show how to turn MARC into semantic data with context,
and indexing such data makes much more sense and is also more fun.
By executing `curl localhost:9200/_search?pretty` you can examine the result.
image:{img}/marcxchange-in-elasticsearch.png[]
## Bibliographic character sets
Bibliographic character sets predate the era of Unicode. Before Unicode, character sets were

Binary file not shown.

After

Width:  |  Height:  |  Size: 159 KiB

View file

@ -55,16 +55,16 @@ import java.util.zip.GZIPOutputStream;
*/
public class MarcJsonWriter extends MarcContentHandler implements Flushable, Closeable {
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
private static final int DEFAULT_BUFFER_SIZE = 65536;
public static final String LEADER_TAG = "_LEADER";
public static final String FORMAT_TAG = "_FORMAT";
public static final String TYPE_TAG = "_TYPE";
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
private static final int DEFAULT_BUFFER_SIZE = 65536;
private final Lock lock;
private final StringBuilder sb;
@ -451,17 +451,15 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
* Split records, if configured.
*/
private void afterRecord() {
if (fileNamePattern != null) {
if (getRecordCounter() % splitlimit == 0) {
try {
endCollection();
close();
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
top = true;
beginCollection();
} catch (IOException e) {
logger.log(Level.SEVERE, e.getMessage(), e);
}
if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) {
try {
endCollection();
close();
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
top = true;
beginCollection();
} catch (IOException e) {
logger.log(Level.SEVERE, e.getMessage(), e);
}
}
}
@ -476,12 +474,17 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
new BufferedOutputStream(out, bufferSize), StandardCharsets.UTF_8);
}
// Matches a literal double quote character.
private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL);
// Replacement text for a double quote: a backslash followed by the quote.
private static final String escapeQuote = "\\\"";
// Matches a single backslash character.
private static final Pattern backslashPattern = Pattern.compile("\\\\");
// Replacement text for a backslash: two backslashes.
private static final String escapeBackslash = "\\\\";

/**
 * Escapes backslashes and double quotes in a string value.
 * Backslashes are doubled first, so the backslashes inserted in front of
 * double quotes in the second step are not doubled again.
 * No other characters are changed.
 *
 * @param value the raw string value, must not be {@code null}
 * @return the value with every backslash doubled and every double quote
 *         preceded by a backslash
 */
private static String escape(String value) {
    String s = backslashPattern.matcher(value).replaceAll(Matcher.quoteReplacement(escapeBackslash));
    return quotePattern.matcher(s).replaceAll(Matcher.quoteReplacement(escapeQuote));
}
private void writeMetaDataLine(MarcRecord marcRecord) {

View file

@ -485,17 +485,15 @@ public class MarcXchangeWriter extends MarcContentHandler implements Flushable,
* Split records if configured.
*/
private void afterRecord() {
if (fileNamePattern != null) {
if (getRecordCounter() % splitlimit == 0) {
try {
endCollection();
writer.close();
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
setupEventConsumer(writer, indent);
beginCollection();
} catch (IOException e) {
logger.log(Level.SEVERE, e.getMessage(), e);
}
if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) {
try {
endCollection();
writer.close();
newWriter(fileNamePattern, fileNameCounter, bufferSize, compress);
setupEventConsumer(writer, indent);
beginCollection();
} catch (IOException e) {
logger.log(Level.SEVERE, e.getMessage(), e);
}
}
}

View file

@ -35,12 +35,37 @@ import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
*/
public class MarcJsonWriterTest {
// Matches a literal double quote character.
private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL);
// Replacement text for a double quote: a backslash followed by the quote.
private static final String escapeQuote = "\\\"";
// Matches a single backslash character.
private static final Pattern backslashPattern = Pattern.compile("\\\\");
// Replacement text for a backslash: two backslashes.
private static final String escapeBackslash = "\\\\";

/**
 * Escapes backslashes and double quotes in the given value.
 * Backslashes are doubled before quotes are handled, so the backslashes
 * added in front of quotes are left as they are.
 *
 * @param value the raw string, must not be {@code null}
 * @return the escaped string
 */
private static String escape(String value) {
    final String backslashReplacement = Matcher.quoteReplacement(escapeBackslash);
    final String quoteReplacement = Matcher.quoteReplacement(escapeQuote);
    final Matcher backslashes = backslashPattern.matcher(value);
    final String doubled = backslashes.replaceAll(backslashReplacement);
    final Matcher quotes = quotePattern.matcher(doubled);
    return quotes.replaceAll(quoteReplacement);
}
/**
 * Checks that {@code escape} puts a backslash in front of double quotes
 * and doubles a single backslash.
 */
@Test
public void testEscapeJSON() {
    // Double quotes must come out preceded by a backslash.
    String quoted = "\"Hello world\"";
    assertEquals("\\\"Hello world\\\"", escape(quoted));
    // A single backslash must come out doubled.
    String backslashed = "\\P123";
    assertEquals("\\\\P123", escape(backslashed));
}
/**
* {@code MarcJsonWriter} can receive MARC fields.
*
@ -181,13 +206,17 @@ public class MarcJsonWriterTest {
assertNull(writer.getException());
}
File f0 = new File("build/0.json");
assertTrue(f0.exists() && f0.length() == 6015);
assertTrue(f0.exists());
assertEquals(6015, f0.length());
File f1 = new File("build/1.json");
assertTrue(f1.exists() && f1.length() == 7127);
assertTrue(f1.exists());
assertEquals(7130, f1.length());
File f2 = new File("build/2.json");
assertTrue(f2.exists() && f2.length() == 6426);
assertTrue(f2.exists());
assertEquals(6426, f2.length());
File f3 = new File("build/3.json");
assertTrue(f3.exists() && f3.length() == 2110);
assertTrue(f3.exists());
assertEquals(2110, f3.length());
File f4 = new File("build/4.json");
assertFalse(f4.exists());
}
@ -213,13 +242,17 @@ public class MarcJsonWriterTest {
assertEquals(10, writer.getRecordCounter());
}
File f0 = new File("build/bulk0.jsonl");
assertTrue(f0.exists() && f0.length() == 6295);
assertTrue(f0.exists());
assertEquals(6295, f0.length());
File f1 = new File("build/bulk1.jsonl");
assertTrue(f1.exists() && f1.length() == 7407);
assertTrue(f1.exists());
assertEquals(7410, f1.length());
File f2 = new File("build/bulk2.jsonl");
assertTrue(f2.exists() && f2.length() == 6706);
assertTrue(f2.exists());
assertEquals(6706, f2.length());
File f3 = new File("build/bulk3.jsonl");
assertTrue(f3.exists() && f3.length() == 2204);
assertTrue(f3.exists());
assertEquals(2204, f3.length());
File f4 = new File("build/bulk4.jsonl");
assertFalse(f4.exists());
}
@ -250,7 +283,7 @@ public class MarcJsonWriterTest {
assertEquals(2141, f0.length());
File f1 = new File("build/bulk1.jsonl.gz");
assertTrue(f1.exists());
assertEquals(2605, f1.length());
assertEquals(2608, f1.length());
File f2 = new File("build/bulk2.jsonl.gz");
assertTrue(f2.exists());
assertEquals(2667, f2.length());