diff --git a/README.adoc b/README.adoc index f9f6fef..da4af2c 100644 --- a/README.adoc +++ b/README.adoc @@ -1,3 +1,7 @@ +// Use attribute to shorten urls +:repo: https://github.com/xbib/marc +:img: {repo}/raw/master/src/jbake/assets/images + # xbib MARC ## Bibliographic data processing library for Java @@ -127,6 +131,10 @@ The result is a very basic MARC field based index, which is cumbersome to config In upcoming projects, I will show how to turn MARC into semantic data with context, and indexing such data makes much more sense and is also more fun. +By executing `curl localhost:9200/_search?pretty` you can examine the result. + +image:{img}/marcxchange-in-elasticsearch.png[] + ## Bibliographic character sets Bibliographic character sets predate the era of Unicode. Before Unicode, characters sets were diff --git a/src/jbake/assets/images/marcxchange-in-elasticsearch.png b/src/jbake/assets/images/marcxchange-in-elasticsearch.png new file mode 100644 index 0000000..2fa6272 Binary files /dev/null and b/src/jbake/assets/images/marcxchange-in-elasticsearch.png differ diff --git a/src/main/java/org/xbib/marc/json/MarcJsonWriter.java b/src/main/java/org/xbib/marc/json/MarcJsonWriter.java index 7e96ea3..7587cd8 100644 --- a/src/main/java/org/xbib/marc/json/MarcJsonWriter.java +++ b/src/main/java/org/xbib/marc/json/MarcJsonWriter.java @@ -55,16 +55,16 @@ import java.util.zip.GZIPOutputStream; */ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Closeable { - private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName()); - - private static final int DEFAULT_BUFFER_SIZE = 65536; - public static final String LEADER_TAG = "_LEADER"; public static final String FORMAT_TAG = "_FORMAT"; public static final String TYPE_TAG = "_TYPE"; + private static final Logger logger = Logger.getLogger(MarcJsonWriter.class.getName()); + + private static final int DEFAULT_BUFFER_SIZE = 65536; + private final Lock lock; private final 
StringBuilder sb; @@ -451,17 +451,15 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo * Split records, if configured. */ private void afterRecord() { - if (fileNamePattern != null) { - if (getRecordCounter() % splitlimit == 0) { - try { - endCollection(); - close(); - newWriter(fileNamePattern, fileNameCounter, bufferSize, compress); - top = true; - beginCollection(); - } catch (IOException e) { - logger.log(Level.SEVERE, e.getMessage(), e); - } + if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) { + try { + endCollection(); + close(); + newWriter(fileNamePattern, fileNameCounter, bufferSize, compress); + top = true; + beginCollection(); + } catch (IOException e) { + logger.log(Level.SEVERE, e.getMessage(), e); } } } @@ -476,12 +474,17 @@ public class MarcJsonWriter extends MarcContentHandler implements Flushable, Clo new BufferedOutputStream(out, bufferSize), StandardCharsets.UTF_8); } - private static final Pattern p = Pattern.compile("\"", Pattern.LITERAL); + private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL); - private static final String replacement = "\\\""; + private static final String escapeQuote = "\\\""; + + private static final Pattern backslashPattern = Pattern.compile("\\\\"); + + private static final String escapeBackslash = "\\\\"; private static String escape(String value) { - return p.matcher(value).replaceAll(Matcher.quoteReplacement(replacement)); + String s = backslashPattern.matcher(value).replaceAll(Matcher.quoteReplacement(escapeBackslash)); + return quotePattern.matcher(s).replaceAll(Matcher.quoteReplacement(escapeQuote)); } private void writeMetaDataLine(MarcRecord marcRecord) { diff --git a/src/main/java/org/xbib/marc/xml/MarcXchangeWriter.java b/src/main/java/org/xbib/marc/xml/MarcXchangeWriter.java index 44d55fe..c341c76 100644 --- a/src/main/java/org/xbib/marc/xml/MarcXchangeWriter.java +++ b/src/main/java/org/xbib/marc/xml/MarcXchangeWriter.java @@ 
-485,17 +485,15 @@ public class MarcXchangeWriter extends MarcContentHandler implements Flushable, * Split records if configured. */ private void afterRecord() { - if (fileNamePattern != null) { - if (getRecordCounter() % splitlimit == 0) { - try { - endCollection(); - writer.close(); - newWriter(fileNamePattern, fileNameCounter, bufferSize, compress); - setupEventConsumer(writer, indent); - beginCollection(); - } catch (IOException e) { - logger.log(Level.SEVERE, e.getMessage(), e); - } + if (fileNamePattern != null && getRecordCounter() % splitlimit == 0) { + try { + endCollection(); + writer.close(); + newWriter(fileNamePattern, fileNameCounter, bufferSize, compress); + setupEventConsumer(writer, indent); + beginCollection(); + } catch (IOException e) { + logger.log(Level.SEVERE, e.getMessage(), e); } } } diff --git a/src/test/java/org/xbib/marc/json/MarcJsonWriterTest.java b/src/test/java/org/xbib/marc/json/MarcJsonWriterTest.java index f6c834f..ac87db5 100644 --- a/src/test/java/org/xbib/marc/json/MarcJsonWriterTest.java +++ b/src/test/java/org/xbib/marc/json/MarcJsonWriterTest.java @@ -35,12 +35,37 @@ import java.io.FileOutputStream; import java.io.InputStream; import java.nio.charset.Charset; import java.text.Normalizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * */ public class MarcJsonWriterTest { + private static final Pattern quotePattern = Pattern.compile("\"", Pattern.LITERAL); + + private static final String escapeQuote = "\\\""; + + private static final Pattern backslashPattern = Pattern.compile("\\\\"); + + private static final String escapeBackslash = "\\\\"; + + private static String escape(String value) { + String s = backslashPattern.matcher(value).replaceAll(Matcher.quoteReplacement(escapeBackslash)); + return quotePattern.matcher(s).replaceAll(Matcher.quoteReplacement(escapeQuote)); + } + + @Test + public void testEscapeJSON() { + String s = "\"Hello world\""; + String t = escape(s); + assertEquals("\\\"Hello 
world\\\"", t); + s = "\\P123"; + t = escape(s); + assertEquals("\\\\P123", t); + } + /** * {@code }MarcJsonWriter} can receive MARC fields. * @@ -181,13 +206,17 @@ public class MarcJsonWriterTest { assertNull(writer.getException()); } File f0 = new File("build/0.json"); - assertTrue(f0.exists() && f0.length() == 6015); + assertTrue(f0.exists()); + assertEquals(6015, f0.length()); File f1 = new File("build/1.json"); - assertTrue(f1.exists() && f1.length() == 7127); + assertTrue(f1.exists()); + assertEquals(7130, f1.length()); File f2 = new File("build/2.json"); - assertTrue(f2.exists() && f2.length() == 6426); + assertTrue(f2.exists()); + assertEquals(6426, f2.length()); File f3 = new File("build/3.json"); - assertTrue(f3.exists() && f3.length() == 2110); + assertTrue(f3.exists()); + assertEquals(2110, f3.length()); File f4 = new File("build/4.json"); assertFalse(f4.exists()); } @@ -213,13 +242,17 @@ public class MarcJsonWriterTest { assertEquals(10, writer.getRecordCounter()); } File f0 = new File("build/bulk0.jsonl"); - assertTrue(f0.exists() && f0.length() == 6295); + assertTrue(f0.exists()); + assertEquals(6295, f0.length()); File f1 = new File("build/bulk1.jsonl"); - assertTrue(f1.exists() && f1.length() == 7407); + assertTrue(f1.exists()); + assertEquals(7410, f1.length()); File f2 = new File("build/bulk2.jsonl"); - assertTrue(f2.exists() && f2.length() == 6706); + assertTrue(f2.exists()); + assertEquals(6706, f2.length()); File f3 = new File("build/bulk3.jsonl"); - assertTrue(f3.exists() && f3.length() == 2204); + assertTrue(f3.exists()); + assertEquals(2204, f3.length()); File f4 = new File("build/bulk4.jsonl"); assertFalse(f4.exists()); } @@ -250,7 +283,7 @@ public class MarcJsonWriterTest { assertEquals(2141, f0.length()); File f1 = new File("build/bulk1.jsonl.gz"); assertTrue(f1.exists()); - assertEquals(2605, f1.length()); + assertEquals(2608, f1.length()); File f2 = new File("build/bulk2.jsonl.gz"); assertTrue(f2.exists()); assertEquals(2667, 
f2.length());