From 6c80bd052f86b4232d4aee10f456736e9b726cc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=CC=88rg=20Prante?= Date: Mon, 6 Mar 2017 22:42:57 +0100 Subject: [PATCH] add ISSN example --- README.adoc | 68 +++++++++++++- build.gradle | 1 + gradle.properties | 3 +- .../java/org/xbib/marc/MarcFieldAdapter.java | 46 ++++++++++ .../xbib/marc/filter/MarcFieldFilterTest.java | 91 +++++++++++++++++++ .../resources/org/xbib/marc/filter/issns.mrc | 1 + 6 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 src/main/java/org/xbib/marc/MarcFieldAdapter.java create mode 100644 src/test/java/org/xbib/marc/filter/MarcFieldFilterTest.java create mode 100644 src/test/resources/org/xbib/marc/filter/issns.mrc diff --git a/README.adoc b/README.adoc index eb70717..e56a371 100644 --- a/README.adoc +++ b/README.adoc @@ -122,7 +122,6 @@ try (MarcJsonWriter writer = new MarcJsonWriter("bulk%d.jsonl.gz", 10000, .writeCollection(); } - ---- where the result can be indexed by a simple bash script using `curl`, because our JSON @@ -148,6 +147,73 @@ By executing `curl localhost:9200/_search?pretty` you can examine the result. 
image:{img}/marcxchange-in-elasticsearch.png[] +### Example: finding all ISSNs + +This Java program scans through a MARC file, checks for ISSN values, and collects them in +JSON format (the library `org.xbib:content-core:1.0.7` is used for JSON formatting) + +[source,java] +---- +public void findISSNs() throws IOException { + Map<String, List<Map<String, String>>> result = new TreeMap<>(); + // set up MARC listener + MarcListener marcListener = new MarcFieldAdapter() { + @Override + public void field(MarcField field) { + Collection<Map<String, String>> values = field.getSubfields().stream() + .filter(f -> matchISSNField(field, f)) + .map(f -> Collections.singletonMap(f.getId(), f.getValue())) + .collect(Collectors.toList()); + if (!values.isEmpty()) { + result.putIfAbsent(field.getTag(), new ArrayList<>()); + List<Map<String, String>> list = result.get(field.getTag()); + list.addAll(values); + result.put(field.getTag(), list); + } + } + }; + // read MARC file + Marc.builder() + .setInputStream(getClass().getResource("issns.mrc").openStream()) + .setMarcListener(marcListener) + .build() + .writeCollection(); + // collect ISSNs + List<String> issns = result.values().stream() + .map(l -> l.stream() + .map(m -> m.values().iterator().next()) + .collect(Collectors.toList())) + .flatMap(List::stream) + .distinct() + .collect(Collectors.toList()); + + // JSON output + XContentBuilder builder = contentBuilder().prettyPrint() + .startObject(); + for (Map.Entry<String, List<Map<String, String>>> entry : result.entrySet()) { + builder.field(entry.getKey(), entry.getValue()); + } + builder.array("issns", issns); + builder.endObject(); + + logger.log(Level.INFO, builder.string()); +} + +private static boolean matchISSNField(MarcField field, MarcField.Subfield subfield) { + switch (field.getTag()) { + case "011": { + return "a".equals(subfield.getId()) || "f".equals(subfield.getId()); + } + case "421": + case "451": + case "452": + case "488": + return "x".equals(subfield.getId()); + } + return false; +} +---- + ## Bibliographic character sets Bibliographic character sets predate the era of
Unicode. Before Unicode, characters sets were diff --git a/build.gradle b/build.gradle index 2abfaf1..0cfcb57 100644 --- a/build.gradle +++ b/build.gradle @@ -40,6 +40,7 @@ dependencies { testCompile "org.xmlunit:xmlunit-matchers:${project.property('xmlunit-matchers.version')}" testCompile "com.github.stefanbirkner:system-rules:${project.property('system-rules.version')}" testCompile "org.xbib:bibliographic-character-sets:${project.property('bibliographic-character-sets.version')}" + testCompile "org.xbib:content-core:${project.property('content-core.version')}" asciidoclet "org.asciidoctor:asciidoclet:${project.property('asciidoclet.version')}" wagon "org.apache.maven.wagon:wagon-ssh-external:${project.property('wagon-ssh-external.version')}" } diff --git a/gradle.properties b/gradle.properties index 35fc7bb..3cd29d4 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,11 +1,12 @@ group = org.xbib name = marc -version = 1.0.10 +version = 1.0.11 junit.version = 4.12 xalan.version = 2.7.2 xmlunit-matchers.version = 2.3.0 system-rules.version = 1.16.0 bibliographic-character-sets.version = 1.0.0 +content-core.version = 1.0.7 asciidoclet.version = 1.5.4 wagon-ssh-external.version = 2.10 diff --git a/src/main/java/org/xbib/marc/MarcFieldAdapter.java b/src/main/java/org/xbib/marc/MarcFieldAdapter.java new file mode 100644 index 0000000..e4bdf5c --- /dev/null +++ b/src/main/java/org/xbib/marc/MarcFieldAdapter.java @@ -0,0 +1,46 @@ +/* + Copyright 2016 Jörg Prante + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+ */
+package org.xbib.marc;
+
+/**
+ * A {@link MarcListener} with empty callbacks; subclasses override only the ones they need.
+ */
+public class MarcFieldAdapter implements MarcListener {
+    /** No-op implementation of the {@code beginCollection} callback. */
+    @Override
+    public void beginCollection() {}
+
+    /** No-op implementation of the {@code beginRecord} callback; both arguments are ignored. */
+    @Override
+    public void beginRecord(String format, String type) {}
+
+    /** No-op implementation of the {@code leader} callback; the label is ignored. */
+    @Override
+    public void leader(String label) {}
+
+    /** No-op implementation of the {@code field} callback; the field is ignored. */
+    @Override
+    public void field(MarcField field) {}
+
+    /** No-op implementation of the {@code endRecord} callback. */
+    @Override
+    public void endRecord() {}
+
+    /** No-op implementation of the {@code endCollection} callback. */
+    @Override
+    public void endCollection() {}
+}
diff --git a/src/test/java/org/xbib/marc/filter/MarcFieldFilterTest.java b/src/test/java/org/xbib/marc/filter/MarcFieldFilterTest.java
new file mode 100644
index 0000000..0addafa
--- /dev/null
+++ b/src/test/java/org/xbib/marc/filter/MarcFieldFilterTest.java
@@ -0,0 +1,91 @@
+package org.xbib.marc.filter;
+
+import static org.xbib.content.json.JsonXContent.contentBuilder;
+
+import org.junit.Test;
+import org.xbib.content.XContentBuilder;
+import org.xbib.marc.Marc;
+import org.xbib.marc.MarcField;
+import org.xbib.marc.MarcFieldAdapter;
+import org.xbib.marc.MarcListener;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+/**
+ * Demo of collecting ISSNs from a MARC file.
+ *
+ * "issns.mrc" courtesy of Steven Hirren (steven.hirren.gmail.com)
+ */
+public class MarcFieldFilterTest {
+
+    private static final Logger logger = Logger.getLogger(MarcFieldFilterTest.class.getName());
+
+    /**
+     * Parses the "issns.mrc" test resource, groups every matching subfield by the tag of its
+     * field, and logs the groups plus the distinct subfield values as pretty-printed JSON.
+     *
+     * @throws IOException if reading the resource or building the JSON output fails
+     */
+    @Test
+    public void findISSNs() throws IOException {
+        // field tag -> one single-entry map (subfield id -> value) per matching subfield;
+        // the TreeMap makes the loop below emit the tags in sorted order
+        Map<String, List<Map<String, String>>> result = new TreeMap<>();
+        // set up MARC listener
+        MarcListener marcListener = new MarcFieldAdapter() {
+            @Override
+            public void field(MarcField field) {
+                Collection<Map<String, String>> values = field.getSubfields().stream()
+                        .filter(f -> matchISSNField(field, f))
+                        .map(f -> Collections.singletonMap(f.getId(), f.getValue()))
+                        .collect(Collectors.toList());
+                if (!values.isEmpty()) {
+                    // create the per-tag list on first use, then append to it
+                    result.computeIfAbsent(field.getTag(), tag -> new ArrayList<>())
+                            .addAll(values);
+                }
+            }
+        };
+        // read MARC file; try-with-resources closes the stream even when parsing fails
+        try (java.io.InputStream in = getClass().getResource("issns.mrc").openStream()) {
+            Marc.builder()
+                    .setInputStream(in)
+                    .setMarcListener(marcListener)
+                    .build()
+                    .writeCollection();
+        }
+        // collect ISSNs: every collected map is a singleton map, so take its only value
+        List<String> issns = result.values().stream()
+                .flatMap(List::stream)
+                .map(m -> m.values().iterator().next())
+                .distinct()
+                .collect(Collectors.toList());
+
+        // JSON output
+        XContentBuilder builder = contentBuilder().prettyPrint()
+                .startObject();
+        for (Map.Entry<String, List<Map<String, String>>> entry : result.entrySet()) {
+            builder.field(entry.getKey(), entry.getValue());
+        }
+        builder.array("issns", issns);
+        builder.endObject();
+
+        logger.log(Level.INFO, builder.string());
+    }
+
+    /**
+     * Decides whether a subfield is one of the tag/subfield-id combinations this demo collects.
+     *
+     * @param field the field owning the subfield; only its tag is inspected
+     * @param subfield the candidate subfield; only its id is inspected
+     * @return {@code true} for ids "a" and "f" of tag "011" and for id "x" of tags "421",
+     *         "451", "452" and "488"; {@code false} for everything else
+     */
+    private static boolean matchISSNField(MarcField field, MarcField.Subfield subfield) {
+        switch (field.getTag()) {
+            case "011":
+                return "a".equals(subfield.getId()) || "f".equals(subfield.getId());
+            case "421":
+            case "451":
+            case "452":
+            case "488":
+                return "x".equals(subfield.getId());
+            default:
+                return false;
+        }
+    }
+}
diff --git a/src/test/resources/org/xbib/marc/filter/issns.mrc b/src/test/resources/org/xbib/marc/filter/issns.mrc
new file mode 100644
index
0000000..cef39d9 --- /dev/null +++ b/src/test/resources/org/xbib/marc/filter/issns.mrc @@ -0,0 +1 @@ +00675cas a2200253 a 450 001000300000005001300003011002500016035001900041100004100060102000700101105001800108106000600126110001600132200002200148207003100170210004500201452004200246530002200288531001600310676001600326801001300342801001900355856004700374202013060511330 a0261-3794f0261-3794 a(ISSN)02613794 a20130605a19829999 ba aGB a 1  ar aaha 1 aElectoral studies0 aVol. 1, no. 1 (Apr. 1982)- aGuildfordcButterworth Scientificd1982-11tElectoral studies (Online)x1873-6890 aElectoral studies aElect. stud a324.605v19 0aFRbISSN 2aGRbEKTgAACR24 uhttps://eskep.ekt.gr/eskep/journal/show/2000874cas a2200301 450 001000300000005001300003011002500016035001900041040001100060100004100071102000700112105001800119106000600137110001600143200003400159207002100193210004600214321003400260452005400294530003400348531001900382676001100401711003000412801001300442801001900455856005100474856004700525402013060511270 a0001-4842f0001-4842 a(ISSN)00014842 aACHRE4 a20130605a19689999 ba aUS a 1  ar aafa 1 aAccounts of chemical research0 av. 1- Jan. 1968- aWashington, DCcAmerican Chemical Society aChemical abstractsx0009-225810tAccounts of chemical research (Online)x1520-4898 aAccounts of chemical research aAcc. chem. res a540/.5 2aAmerican Chemical Society 0aFRbISSN 2aGRbEKTgAACR24 uhttp://pubs.acs.org/journals/achre4/index.html4 uhttps://eskep.ekt.gr/eskep/journal/show/4000927cas a2200301 i 450 001000300000005001300003011002500016035001900041100004100060102000700101106000600108110001600114200002300130210003000153421004800183421003500231421004100266434004300307434004800350452005400398530002300452531001800475675002400493711002900517801001300546801001900559856004700578602013060511310 a0001-5342f0001-5342 a(ISSN)00015342 a20130605a19359999 | ba aNL ar aah 1 aActa biotheoretica aLeidencRijksuniversiteit 0tActa biotheoretica. 
Supplementumx0169-7242 0tFolia biotheoreticax0920-2676 1tBibliotheca biotheoreticax0373-3408 tBibliographia biotheoreticax0920-2684 tActa biotheoretica. Supplementumx0169-724211tActa biotheoretica (Dordrecht. Online)x1572-8358 aActa biotheoretica aActa biotheor a573v15th Dutch ed. 2aRijksuniversiteit Leiden 0aFRbISSN 2aGRbEKTgAACR24 uhttps://eskep.ekt.gr/eskep/journal/show/60 \ No newline at end of file