fixing JSON escape, add Elasticsearch result example

This commit is contained in:
Jörg Prante 2016-09-28 14:45:32 +02:00
parent 957209c99e
commit e34eedc86d
5 changed files with 80 additions and 38 deletions

View file

@ -1,3 +1,7 @@
// Use attribute to shorten urls
:repo: https://github.com/xbib/marc
:img: {repo}/raw/master/src.jbake/assets/images
# xbib MARC # xbib MARC
## Bibliographic data processing library for Java ## Bibliographic data processing library for Java
@ -127,6 +131,10 @@ The result is a very basic MARC field based index, which is cumbersome to config
In upcoming projects, I will show how to turn MARC into semantic data with context, In upcoming projects, I will show how to turn MARC into semantic data with context,
and indexing such data makes much more sense and is also more fun. and indexing such data makes much more sense and is also more fun.
By executing `curl localhost:9200/_search?pretty` you can examine the result.
image:{img}/marcxchange-in-elasticsearch.png[]
## Bibliographic character sets ## Bibliographic character sets
Bibliographic character sets predate the era of Unicode. Before Unicode, character sets were Bibliographic character sets predate the era of Unicode. Before Unicode, character sets were

Binary file not shown.

After

Width:  |  Height:  |  Size: 159 KiB

View file

@ -55,16 +55,16 @@ import java.util.zip.GZIPOutputStream;
*/ */
public class MarcJsonWriter extends MarcContentHandler implements Flushable, Closeable { public class MarcJsonWriter extends MarcContentHandler implements Flushable, Closeable {
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
private static final int DEFAULT_BUFFER_SIZE = 65536;
public static final String LEADER_TAG = "_LEADER"; public static final String LEADER_TAG = "_LEADER";
public static final String FORMAT_TAG = "_FORMAT"; public static final String FORMAT_TAG = "_FORMAT";
public static final String TYPE_TAG = "_TYPE"; public static final String TYPE_TAG = "_TYPE";
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
private static final int DEFAULT_BUFFER_SIZE = 65536;
private final Lock lock; private final Lock lock;
private final StringBuilder sb; private final StringBuilder sb;
@ -451,8 +451,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
* Split records, if configured. * Split records, if configured.
*/ */
private void afterRecord() { private void afterRecord() {
if (fileNamePattern != null) { if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) {
if (getRecordCounter() % splitlimit == 0) {
try { try {
endCollection(); endCollection();
close(); close();
@ -464,7 +463,6 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
} }
} }
} }
}
private void newWriter(String fileNamePattern, AtomicInteger fileNameCounter, private void newWriter(String fileNamePattern, AtomicInteger fileNameCounter,
int bufferSize, boolean compress) throws IOException { int bufferSize, boolean compress) throws IOException {
@ -476,12 +474,17 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
new BufferedOutputStream(out, bufferSize), StandardCharsets.UTF_8); new BufferedOutputStream(out, bufferSize), StandardCharsets.UTF_8);
} }
private static final Pattern p = Pattern.compile("\"", Pattern.LITERAL); private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL);
private static final String replacement = "\\\""; private static final String escapeQuote = "\\\"";
private static final Pattern backslashPattern = Pattern.compile("\\\\");
private static final String escapeBackslash = "\\\\";
private static String escape(String value) { private static String escape(String value) {
return p.matcher(value).replaceAll(Matcher.quoteReplacement(replacement)); String s = backslashPattern.matcher(value).replaceAll(Matcher.quoteReplacement(escapeBackslash));
return quotePattern.matcher(s).replaceAll(Matcher.quoteReplacement(escapeQuote));
} }
private void writeMetaDataLine(MarcRecord marcRecord) { private void writeMetaDataLine(MarcRecord marcRecord) {

View file

@ -485,8 +485,7 @@ public class MarcXchangeWriter extends MarcContentHandler implements Flushable,
* Split records if configured. * Split records if configured.
*/ */
private void afterRecord() { private void afterRecord() {
if (fileNamePattern != null) { if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) {
if (getRecordCounter() % splitlimit == 0) {
try { try {
endCollection(); endCollection();
writer.close(); writer.close();
@ -498,7 +497,6 @@ public class MarcXchangeWriter extends MarcContentHandler implements Flushable,
} }
} }
} }
}
private void newWriter(String fileNamePattern, AtomicInteger fileNameCounter, private void newWriter(String fileNamePattern, AtomicInteger fileNameCounter,
int bufferSize, boolean compress) int bufferSize, boolean compress)

View file

@ -35,12 +35,37 @@ import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** /**
* *
*/ */
public class MarcJsonWriterTest { public class MarcJsonWriterTest {
/** Matches a single backslash character. */
private static final Pattern BACKSLASH_PATTERN = Pattern.compile("\\\\");
/** Replacement text for a backslash: two backslashes. */
private static final String ESCAPED_BACKSLASH = "\\\\";
/** Matches a single double-quote character (literal match, no regex syntax). */
private static final Pattern QUOTE_PATTERN = Pattern.compile("\"", Pattern.LITERAL);
/** Replacement text for a double quote: a backslash followed by the quote. */
private static final String ESCAPED_QUOTE = "\\\"";

/**
 * Escapes backslashes and double quotes in a string so that it can be
 * embedded in a JSON string literal. No other characters are altered.
 *
 * @param value the raw string
 * @return the string with every backslash doubled and every double quote
 *         prefixed by a backslash
 */
private static String escape(String value) {
    // Backslashes are handled first; otherwise the backslashes inserted
    // in front of quotes in the second pass would be doubled as well.
    Matcher backslashes = BACKSLASH_PATTERN.matcher(value);
    String doubled = backslashes.replaceAll(Matcher.quoteReplacement(ESCAPED_BACKSLASH));
    Matcher quotes = QUOTE_PATTERN.matcher(doubled);
    return quotes.replaceAll(Matcher.quoteReplacement(ESCAPED_QUOTE));
}
/**
 * Checks that {@code escape} prefixes double quotes with a backslash
 * and doubles backslashes.
 */
@Test
public void testEscapeJSON() {
    // double quotes are prefixed with a backslash
    String quoted = "\"Hello world\"";
    String escapedQuoted = escape(quoted);
    assertEquals("\\\"Hello world\\\"", escapedQuoted);
    // a single backslash is doubled
    String backslashed = "\\P123";
    String escapedBackslashed = escape(backslashed);
    assertEquals("\\\\P123", escapedBackslashed);
}
/** /**
* {@code MarcJsonWriter} can receive MARC fields. * {@code MarcJsonWriter} can receive MARC fields.
* *
@ -181,13 +206,17 @@ public class MarcJsonWriterTest {
assertNull(writer.getException()); assertNull(writer.getException());
} }
File f0 = new File("build/0.json"); File f0 = new File("build/0.json");
assertTrue(f0.exists() && f0.length() == 6015); assertTrue(f0.exists());
assertEquals(6015, f0.length());
File f1 = new File("build/1.json"); File f1 = new File("build/1.json");
assertTrue(f1.exists() && f1.length() == 7127); assertTrue(f1.exists());
assertEquals(7130, f1.length());
File f2 = new File("build/2.json"); File f2 = new File("build/2.json");
assertTrue(f2.exists() && f2.length() == 6426); assertTrue(f2.exists());
assertEquals(6426, f2.length());
File f3 = new File("build/3.json"); File f3 = new File("build/3.json");
assertTrue(f3.exists() && f3.length() == 2110); assertTrue(f3.exists());
assertEquals(2110, f3.length());
File f4 = new File("build/4.json"); File f4 = new File("build/4.json");
assertFalse(f4.exists()); assertFalse(f4.exists());
} }
@ -213,13 +242,17 @@ public class MarcJsonWriterTest {
assertEquals(10, writer.getRecordCounter()); assertEquals(10, writer.getRecordCounter());
} }
File f0 = new File("build/bulk0.jsonl"); File f0 = new File("build/bulk0.jsonl");
assertTrue(f0.exists() && f0.length() == 6295); assertTrue(f0.exists());
assertEquals(6295, f0.length());
File f1 = new File("build/bulk1.jsonl"); File f1 = new File("build/bulk1.jsonl");
assertTrue(f1.exists() && f1.length() == 7407); assertTrue(f1.exists());
assertEquals(7410, f1.length());
File f2 = new File("build/bulk2.jsonl"); File f2 = new File("build/bulk2.jsonl");
assertTrue(f2.exists() && f2.length() == 6706); assertTrue(f2.exists());
assertEquals(6706, f2.length());
File f3 = new File("build/bulk3.jsonl"); File f3 = new File("build/bulk3.jsonl");
assertTrue(f3.exists() && f3.length() == 2204); assertTrue(f3.exists());
assertEquals(2204, f3.length());
File f4 = new File("build/bulk4.jsonl"); File f4 = new File("build/bulk4.jsonl");
assertFalse(f4.exists()); assertFalse(f4.exists());
} }
@ -250,7 +283,7 @@ public class MarcJsonWriterTest {
assertEquals(2141, f0.length()); assertEquals(2141, f0.length());
File f1 = new File("build/bulk1.jsonl.gz"); File f1 = new File("build/bulk1.jsonl.gz");
assertTrue(f1.exists()); assertTrue(f1.exists());
assertEquals(2605, f1.length()); assertEquals(2608, f1.length());
File f2 = new File("build/bulk2.jsonl.gz"); File f2 = new File("build/bulk2.jsonl.gz");
assertTrue(f2.exists()); assertTrue(f2.exists());
assertEquals(2667, f2.length()); assertEquals(2667, f2.length());