fixing JSON escape, add Elasticsearch result example
This commit is contained in:
parent
957209c99e
commit
e34eedc86d
5 changed files with 80 additions and 38 deletions
|
@ -1,3 +1,7 @@
|
|||
// Use attribute to shorten urls
|
||||
:repo: https://github.com/xbib/marc
|
||||
:img: {repo}/raw/master/src.jbake/assets/images
|
||||
|
||||
# xbib MARC
|
||||
|
||||
## Bibliographic data processing library for Java
|
||||
|
@ -127,6 +131,10 @@ The result is a very basic MARC field based index, which is cumbersome to config
|
|||
In upcoming projects, I will show how to turn MARC into semantic data with context,
|
||||
and indexing such data makes much more sense and is also more fun.
|
||||
|
||||
By executing `curl localhost:9200/_search?pretty` you can examine the result.
|
||||
|
||||
image:{img}/marcxchange-in-elasticsearch.png[]
|
||||
|
||||
## Bibliographic character sets
|
||||
|
||||
Bibliographic character sets predate the era of Unicode. Before Unicode, character sets were
|
||||
|
|
BIN
src/jbake/assets/images/marcxchange-in-elasticsearch.png
Normal file
BIN
src/jbake/assets/images/marcxchange-in-elasticsearch.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 159 KiB |
|
@ -55,16 +55,16 @@ import java.util.zip.GZIPOutputStream;
|
|||
*/
|
||||
public class MarcJsonWriter extends MarcContentHandler implements Flushable, Closeable {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
|
||||
|
||||
private static final int DEFAULT_BUFFER_SIZE = 65536;
|
||||
|
||||
public static final String LEADER_TAG = "_LEADER";
|
||||
|
||||
public static final String FORMAT_TAG = "_FORMAT";
|
||||
|
||||
public static final String TYPE_TAG = "_TYPE";
|
||||
|
||||
private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName());
|
||||
|
||||
private static final int DEFAULT_BUFFER_SIZE = 65536;
|
||||
|
||||
private final Lock lock;
|
||||
|
||||
private final StringBuilder sb;
|
||||
|
@ -451,8 +451,7 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
|||
* Split records, if configured.
|
||||
*/
|
||||
private void afterRecord() {
|
||||
if (fileNamePattern != null) {
|
||||
if (getRecordCounter() % splitlimit == 0) {
|
||||
if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) {
|
||||
try {
|
||||
endCollection();
|
||||
close();
|
||||
|
@ -464,7 +463,6 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void newWriter(String fileNamePattern, AtomicInteger fileNameCounter,
|
||||
int bufferSize, boolean compress) throws IOException {
|
||||
|
@ -476,12 +474,17 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo
|
|||
new BufferedOutputStream(out, bufferSize), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
private static final Pattern p = Pattern.compile("\"", Pattern.LITERAL);
|
||||
private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL);
|
||||
|
||||
private static final String replacement = "\\\"";
|
||||
private static final String escapeQuote = "\\\"";
|
||||
|
||||
private static final Pattern backslashPattern = Pattern.compile("\\\\");
|
||||
|
||||
private static final String escapeBackslash = "\\\\";
|
||||
|
||||
private static String escape(String value) {
|
||||
return p.matcher(value).replaceAll(Matcher.quoteReplacement(replacement));
|
||||
String s = backslashPattern.matcher(value).replaceAll(Matcher.quoteReplacement(escapeBackslash));
|
||||
return quotePattern.matcher(s).replaceAll(Matcher.quoteReplacement(escapeQuote));
|
||||
}
|
||||
|
||||
private void writeMetaDataLine(MarcRecord marcRecord) {
|
||||
|
|
|
@ -485,8 +485,7 @@ public class MarcXchangeWriter extends MarcContentHandler implements Flushable,
|
|||
* Split records if configured.
|
||||
*/
|
||||
private void afterRecord() {
|
||||
if (fileNamePattern != null) {
|
||||
if (getRecordCounter() % splitlimit == 0) {
|
||||
if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) {
|
||||
try {
|
||||
endCollection();
|
||||
writer.close();
|
||||
|
@ -498,7 +497,6 @@ public class MarcXchangeWriter extends MarcContentHandler implements Flushable,
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void newWriter(String fileNamePattern, AtomicInteger fileNameCounter,
|
||||
int bufferSize, boolean compress)
|
||||
|
|
|
@ -35,12 +35,37 @@ import java.io.FileOutputStream;
|
|||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class MarcJsonWriterTest {
|
||||
|
||||
private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL);
|
||||
|
||||
private static final String escapeQuote = "\\\"";
|
||||
|
||||
private static final Pattern backslashPattern = Pattern.compile("\\\\");
|
||||
|
||||
private static final String escapeBackslash = "\\\\";
|
||||
|
||||
private static String escape(String value) {
|
||||
String s = backslashPattern.matcher(value).replaceAll(Matcher.quoteReplacement(escapeBackslash));
|
||||
return quotePattern.matcher(s).replaceAll(Matcher.quoteReplacement(escapeQuote));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEscapeJSON() {
|
||||
String s = "\"Hello world\"";
|
||||
String t = escape(s);
|
||||
assertEquals("\\\"Hello world\\\"", t);
|
||||
s = "\\P123";
|
||||
t = escape(s);
|
||||
assertEquals("\\\\P123", t);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@code MarcJsonWriter} can receive MARC fields.
|
||||
*
|
||||
|
@ -181,13 +206,17 @@ public class MarcJsonWriterTest {
|
|||
assertNull(writer.getException());
|
||||
}
|
||||
File f0 = new File("build/0.json");
|
||||
assertTrue(f0.exists() && f0.length() == 6015);
|
||||
assertTrue(f0.exists());
|
||||
assertEquals(6015, f0.length());
|
||||
File f1 = new File("build/1.json");
|
||||
assertTrue(f1.exists() && f1.length() == 7127);
|
||||
assertTrue(f1.exists());
|
||||
assertEquals(7130, f1.length());
|
||||
File f2 = new File("build/2.json");
|
||||
assertTrue(f2.exists() && f2.length() == 6426);
|
||||
assertTrue(f2.exists());
|
||||
assertEquals(6426, f2.length());
|
||||
File f3 = new File("build/3.json");
|
||||
assertTrue(f3.exists() && f3.length() == 2110);
|
||||
assertTrue(f3.exists());
|
||||
assertEquals(2110, f3.length());
|
||||
File f4 = new File("build/4.json");
|
||||
assertFalse(f4.exists());
|
||||
}
|
||||
|
@ -213,13 +242,17 @@ public class MarcJsonWriterTest {
|
|||
assertEquals(10, writer.getRecordCounter());
|
||||
}
|
||||
File f0 = new File("build/bulk0.jsonl");
|
||||
assertTrue(f0.exists() && f0.length() == 6295);
|
||||
assertTrue(f0.exists());
|
||||
assertEquals(6295, f0.length());
|
||||
File f1 = new File("build/bulk1.jsonl");
|
||||
assertTrue(f1.exists() && f1.length() == 7407);
|
||||
assertTrue(f1.exists());
|
||||
assertEquals(7410, f1.length());
|
||||
File f2 = new File("build/bulk2.jsonl");
|
||||
assertTrue(f2.exists() && f2.length() == 6706);
|
||||
assertTrue(f2.exists());
|
||||
assertEquals(6706, f2.length());
|
||||
File f3 = new File("build/bulk3.jsonl");
|
||||
assertTrue(f3.exists() && f3.length() == 2204);
|
||||
assertTrue(f3.exists());
|
||||
assertEquals(2204, f3.length());
|
||||
File f4 = new File("build/bulk4.jsonl");
|
||||
assertFalse(f4.exists());
|
||||
}
|
||||
|
@ -250,7 +283,7 @@ public class MarcJsonWriterTest {
|
|||
assertEquals(2141, f0.length());
|
||||
File f1 = new File("build/bulk1.jsonl.gz");
|
||||
assertTrue(f1.exists());
|
||||
assertEquals(2605, f1.length());
|
||||
assertEquals(2608, f1.length());
|
||||
File f2 = new File("build/bulk2.jsonl.gz");
|
||||
assertTrue(f2.exists());
|
||||
assertEquals(2667, f2.length());
|
||||
|
|
Loading…
Reference in a new issue