bibliographic-character-sets/src/main/java/org/xbib/charset/Pica.java

224 lines
8.9 KiB
Java

/*
* Licensed to Jörg Prante and xbib under one or more contributor
* license agreements. See the NOTICE.txt file distributed with this work
* for additional information regarding copyright ownership.
*
* Copyright (C) 2016 Jörg Prante and xbib
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses
* or write to the Free Software Foundation, Inc., 51 Franklin Street,
* Fifth Floor, Boston, MA 02110-1301 USA.
*
* The interactive user interfaces in modified source and object code
* versions of this program must display Appropriate Legal Notices,
* as required under Section 5 of the GNU Affero General Public License.
*
* In accordance with Section 7(b) of the GNU Affero General Public
* License, these Appropriate Legal Notices must retain the display of the
* "Powered by xbib" logo. If the display of the logo is not reasonably
* feasible for technical reasons, the Appropriate Legal Notices must display
* the words "Powered by xbib".
*/
package org.xbib.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
/**
 * PICA character set implementation.
 *
 * This character set is a modified version of the 'InterMARC' character set
 * and contains 256 tokens.
 *
 * Bytes 0x00..0x9F are transcoded exactly like ISO-8859-1; bytes 0xA0..0xFF
 * are translated through the table defined in this class.
 *
 * A description can be found at
 * <a href="http://www.pica.nl/ne/docu/dn010/html/t07.shtml">the Pica website</a>.
 */
public class Pica extends Charset {

    /** First code position whose meaning is defined by the Pica table. */
    private static final char FIRST_MAPPED = '\u00a0';

    /** Last code position whose meaning is defined by the Pica table. */
    private static final char LAST_MAPPED = '\u00ff';

    /**
     * Table marker (U+0000): the slot gets no entry in either map, so the
     * byte keeps its ISO-8859-1 meaning in both directions.
     */
    private static final char UNCHANGED = '\u0000';

    /**
     * Table marker (U+003F, the question mark): the slot decodes to a
     * question mark. Presumably a placeholder for code positions without a
     * Unicode equivalent. It is deliberately never used as an encode target,
     * otherwise the ASCII question mark would be encoded as one of these
     * high bytes instead of 0x3F.
     */
    private static final char PLACEHOLDER = '\u003f';

    /** Unicode character to Pica code position (as an ISO-8859-1 char). */
    private static final Map<Character, Character> encodeMap = new HashMap<>();

    /** Pica code position (as an ISO-8859-1 char) to Unicode character. */
    private static final Map<Character, Character> decodeMap = new HashMap<>();

    /*
     * Pica character mapping for index subset U+00A0..U+00FF.
     * Pica is equal to US-ASCII but not ISO-8859-1.
     * These are the definitions for Pica characters
     * which are different from ISO-8859-1.
     */
    static {
        Pica.charTable(new char[] {
            '\u00a0', '\u0141', '\u00d8', '\u0110', '\u00de', '\u00c6',
            '\u0152', '\u02b9', '\u00b7', '\u266d', '\u00ae', '\u00b1',
            '\u01a0', '\u01af', '\u02be', '\u00c5', '\u02bf', '\u0142',
            '\u00f8', '\u0111', '\u00fe', '\u00e6', '\u0153', '\u02ba',
            '\u0131', '\u00a3', '\u00f0', '\u03b1', '\u01a1', '\u01b0',
            '\u00df', '\u00e5', '\u0132', '\u00c4', '\u00d6', '\u00dc',
            '\u0186', '\u018e', '\u2260', '\u2192', '\u2264', '\u221e',
            '\u222b', '\u00d7', '\u00a7', '\u22a1', '\u21d4', '\u2265',
            '\u0133', '\u00e4', '\u00f6', '\u00fc', '\u0254', '\u0258',
            '\u00bf', '\u00a1', '\u03b2', '\u003f', '\u03b3', '\u03c0',
            '\u003f', '\u003f', '\u003f', '\u003f', '\u0341', '\u0300',
            '\u0301', '\u0302', '\u0303', '\u0304', '\u0306', '\u0307',
            '\u0308', '\u030c', '\u030a', '\ufe20', '\ufe21', '\u0315',
            '\u030b', '\u0310', '\u0327', '\u0000', '\u0323', '\u0324',
            '\u0325', '\u0333', '\u0332', '\u003f', '\u031c', '\u032e',
            '\ufe23', '\ufe22', '\u003f', '\u0000', '\u0313', '\u003f'
        });
    }

    // Handle to the real charset used for transcoding between characters
    // and bytes once the Pica mapping has been applied.
    private final Charset encodeCharset;

    /**
     * Constructor for the Pica charset. Passes the canonical name "PICA" and
     * the aliases supplied by {@code BibliographicCharsetProvider} to the
     * superclass, then stores ISO-8859-1 as the delegate charset.
     */
    public Pica() {
        super("PICA", BibliographicCharsetProvider.aliasesFor("PICA"));
        encodeCharset = StandardCharsets.ISO_8859_1;
    }

    /**
     * Fill the conversion tables.
     *
     * @param code the Unicode characters for the code positions
     *             0xA0..0xFF, in order; must contain exactly one entry
     *             per position
     * @throws IllegalArgumentException if the table does not have exactly
     *             one entry per code position
     */
    private static void charTable(char[] code) {
        int expected = LAST_MAPPED - FIRST_MAPPED + 1;
        if (code.length != expected) {
            throw new IllegalArgumentException(
                    "Pica table must have " + expected + " entries but has " + code.length);
        }
        for (int i = 0; i < code.length; i++) {
            char unicode = code[i];
            if (unicode == UNCHANGED) {
                continue;
            }
            char position = (char) (FIRST_MAPPED + i);
            decodeMap.put(position, unicode);
            // The placeholder occupies several slots; registering it for
            // encoding would remap the plain question mark to the last one.
            if (unicode != PLACEHOLDER) {
                encodeMap.put(unicode, position);
            }
        }
    }

    /**
     * This method must be implemented by concrete Charsets. We allow
     * subclasses of the Pica charset.
     *
     * @param charset the charset to test
     * @return {@code true} if the given charset is a {@code Pica} instance
     */
    @Override
    public boolean contains(Charset charset) {
        return charset instanceof Pica;
    }

    /**
     * Called by users of this Charset to obtain an encoder.
     *
     * @return a new Pica encoder wrapping a fresh encoder of the delegate charset
     */
    @Override
    public CharsetEncoder newEncoder() {
        return new PicaEncoder(this, encodeCharset.newEncoder());
    }

    /**
     * Called by users of this Charset to obtain a decoder.
     *
     * @return a new Pica decoder sized after a decoder of the delegate charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new PicaDecoder(this, encodeCharset.newDecoder());
    }

    /**
     * The encoder implementation for the Pica Charset. It substitutes mapped
     * characters and lets the delegate encoder produce the bytes.
     */
    private static class PicaEncoder extends CharsetEncoder {

        private final CharsetEncoder baseEncoder;

        /**
         * Constructor, call the superclass constructor with the
         * Charset object and the encoding sizes from the
         * delegate encoder.
         *
         * @param cs the charset this encoder belongs to
         * @param baseEncoder the delegate encoder that produces the bytes
         */
        PicaEncoder(Charset cs, CharsetEncoder baseEncoder) {
            super(cs, baseEncoder.averageBytesPerChar(),
                    baseEncoder.maxBytesPerChar());
            this.baseEncoder = baseEncoder;
        }

        /**
         * Implementation of the encoding loop. The remaining input is copied
         * into a scratch buffer (the caller's buffer may be read-only or
         * re-used), the Pica mapping is applied to the copy, and the
         * delegate encoder turns the copy into bytes. Finally the position
         * of the input buffer is advanced by the number of chars the
         * delegate actually consumed.
         *
         * NOTE(review): characters in U+00A0..U+00FF that have no entry in
         * the encode map are handed to the delegate unchanged, so for
         * example U+00E9 is written as byte 0xE9, which the decoder reads
         * back as U+030C. Consider reporting such characters as unmappable.
         *
         * @param cb the input characters
         * @param bb the output bytes
         * @return the result reported by the delegate encoder
         */
        @Override
        protected CoderResult encodeLoop(CharBuffer cb, ByteBuffer bb) {
            final int start = cb.position();
            CharBuffer mapped = CharBuffer.allocate(cb.remaining());
            // Read through a duplicate so the caller's position stays put
            // until we know how much was really consumed.
            mapped.put(cb.duplicate());
            mapped.flip();
            for (int pos = 0; pos < mapped.limit(); pos++) {
                Character position = encodeMap.get(mapped.get(pos));
                if (position != null) {
                    mapped.put(pos, position);
                }
            }
            baseEncoder.reset();
            // endOfInput is false on purpose: whether this is the last chunk
            // is only known to the outer encode() call. A high surrogate at
            // the end of the chunk is then left unconsumed (underflow)
            // instead of being reported as malformed.
            CoderResult result = baseEncoder.encode(mapped, bb, false);
            cb.position(start + mapped.position());
            return result;
        }
    }

    /**
     * The decoder implementation for the Pica Charset.
     */
    private static class PicaDecoder extends CharsetDecoder {

        /**
         * Constructor, call the superclass constructor with the
         * Charset object and pass along the chars/byte values
         * from the delegate decoder.
         *
         * @param cs the charset this decoder belongs to
         * @param baseDecoder the delegate decoder, only used for size hints
         */
        PicaDecoder(Charset cs, CharsetDecoder baseDecoder) {
            // base decoder only needed for size hints
            super(cs, baseDecoder.averageCharsPerByte(),
                    baseDecoder.maxCharsPerByte());
        }

        /**
         * Implementation of the decoding loop. Every byte yields exactly one
         * char: the mapped character if the decode map has an entry for the
         * byte, otherwise the char with the same unsigned value.
         *
         * @param in the input bytes
         * @param out the output characters
         * @return {@code OVERFLOW} if the output filled up while input
         *         remained, otherwise {@code UNDERFLOW}
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            while (in.hasRemaining()) {
                // Check for space before consuming so nothing has to be un-read.
                if (!out.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }
                char latin1 = (char) (in.get() & 0xFF);
                Character unicode = decodeMap.get(latin1);
                out.put(unicode != null ? unicode : latin1);
            }
            return CoderResult.UNDERFLOW;
        }
    }
}