/*
 * Licensed to Jörg Prante and xbib under one or more contributor
 * license agreements. See the NOTICE.txt file distributed with this work
 * for additional information regarding copyright ownership.
 *
 * Copyright (C) 2016 Jörg Prante and xbib
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses
 * or write to the Free Software Foundation, Inc., 51 Franklin Street,
 * Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * The interactive user interfaces in modified source and object code
 * versions of this program must display Appropriate Legal Notices,
 * as required under Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public
 * License, these Appropriate Legal Notices must retain the display of the
 * "Powered by xbib" logo. If the display of the logo is not reasonably
 * feasible for technical reasons, the Appropriate Legal Notices must display
 * the words "Powered by xbib".
 */
package org.xbib.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * PICA character set implementation.
 *
 * This character set is a modified version of the 'InterMARC' character set
 * and contains 256 tokens.
 *
 * A description can be found at
 * the Pica website.
 */
public class Pica extends Charset {

    /** First code point of the Pica-specific upper range. */
    private static final int FIRST_MAPPED = 0xA0;

    /** Last code point of the Pica-specific upper range. */
    private static final int LAST_MAPPED = 0xFF;

    /** Table marker for a slot that has no mapping at all. */
    private static final char UNASSIGNED = 0;

    /** Unicode character to Pica code point (both held as {@code char}). */
    private static final Map<Character, Character> ENCODE_MAP = new HashMap<>();

    /** Pica code point to Unicode character (both held as {@code char}). */
    private static final Map<Character, Character> DECODE_MAP = new HashMap<>();

    /*
     * Pica character mapping for the index subset 0xA0..0xFF.
     * Pica is equal to US-ASCII but not ISO-8859-1.
     * These are the definitions for Pica characters
     * which are different from ISO-8859-1.
     *
     * A question mark entry means the slot decodes to '?';
     * a NUL entry means the slot is left out of both tables.
     */
    static {
        Pica.charTable(new char[] {
            '\u00a0', '\u0141', '\u00d8', '\u0110', '\u00de', '\u00c6', '\u0152', '\u02b9',
            '\u00b7', '\u266d', '\u00ae', '\u00b1', '\u01a0', '\u01af', '\u02be', '\u00c5',
            '\u02bf', '\u0142', '\u00f8', '\u0111', '\u00fe', '\u00e6', '\u0153', '\u02ba',
            '\u0131', '\u00a3', '\u00f0', '\u03b1', '\u01a1', '\u01b0', '\u00df', '\u00e5',
            '\u0132', '\u00c4', '\u00d6', '\u00dc', '\u0186', '\u018e', '\u2260', '\u2192',
            '\u2264', '\u221e', '\u222b', '\u00d7', '\u00a7', '\u22a1', '\u21d4', '\u2265',
            '\u0133', '\u00e4', '\u00f6', '\u00fc', '\u0254', '\u0258', '\u00bf', '\u00a1',
            '\u03b2', '\u003f', '\u03b3', '\u03c0', '\u003f', '\u003f', '\u003f', '\u003f',
            '\u0341', '\u0300', '\u0301', '\u0302', '\u0303', '\u0304', '\u0306', '\u0307',
            '\u0308', '\u030c', '\u030a', '\ufe20', '\ufe21', '\u0315', '\u030b', '\u0310',
            '\u0327', '\u0000', '\u0323', '\u0324', '\u0325', '\u0333', '\u0332', '\u003f',
            '\u031c', '\u032e', '\ufe23', '\ufe22', '\u003f', '\u0000', '\u0313', '\u003f'
        });
    }

    // Handle to the real charset we'll use for transcoding between
    // characters and bytes. The encoder delegates the final char-to-byte
    // step to it; the decoder only uses it for buffer size hints.
    private final Charset encodeCharset;

    /**
     * Constructor for the Pica charset. Call the superclass
     * constructor to pass along the name(s) we'll be known by.
     * Then save a reference to the delegate Charset (ISO-8859-1).
     */
    public Pica() {
        super("PICA", BibliographicCharsetProvider.aliasesFor("PICA"));
        encodeCharset = StandardCharsets.ISO_8859_1;
    }

    /**
     * Fill the conversion tables.
     *
     * Entry {@code i} of the table describes Pica code point {@code 0xA0 + i}.
     * Slots holding NUL are skipped entirely. Every other slot is registered
     * for decoding. The reverse (encoding) direction is registered only for
     * non-ASCII characters: the table uses the ASCII question mark as a
     * filler for undefined slots, and registering it for encoding would make
     * a plain '?' encode to the last filler slot instead of to itself.
     *
     * @param code the Unicode characters for Pica code points 0xA0..0xFF
     * @throws IllegalArgumentException if the table does not have exactly
     *         one entry per code point in that range
     */
    private static void charTable(char[] code) {
        int expected = LAST_MAPPED - FIRST_MAPPED + 1;
        if (code.length != expected) {
            throw new IllegalArgumentException(
                    "Pica table must have " + expected + " entries but has " + code.length);
        }
        for (int i = 0; i < code.length; i++) {
            char picaChar = (char) (FIRST_MAPPED + i);
            char unicodeChar = code[i];
            if (unicodeChar == UNASSIGNED) {
                continue;
            }
            DECODE_MAP.put(picaChar, unicodeChar);
            if (unicodeChar >= 0x80) {
                // ASCII must keep encoding to itself, so never remap it.
                ENCODE_MAP.put(unicodeChar, picaChar);
            }
        }
    }

    /**
     * This method must be implemented by concrete Charsets. We allow
     * subclasses of the Pica charset.
     *
     * @param charset the charset to test
     * @return true if the given charset is a Pica charset or a subclass of it
     */
    @Override
    public boolean contains(Charset charset) {
        return charset instanceof Pica;
    }

    /**
     * Called by users of this Charset to obtain an encoder.
     * This implementation instantiates an instance of a private class
     * (defined below) and passes it an encoder from the base Charset.
     *
     * @return a new encoder for this charset
     */
    @Override
    public CharsetEncoder newEncoder() {
        return new PicaEncoder(this, encodeCharset.newEncoder());
    }

    /**
     * Called by users of this Charset to obtain a decoder.
     * This implementation instantiates an instance of a private class
     * (defined below) and passes it a decoder from the base Charset.
     *
     * @return a new decoder for this charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new PicaDecoder(this, encodeCharset.newDecoder());
    }

    /**
     * The encoder implementation for the Pica Charset.
     */
    private static class PicaEncoder extends CharsetEncoder {

        private final CharsetEncoder baseEncoder;

        /**
         * Constructor, call the superclass constructor with the
         * Charset object and the encodings sizes from the
         * delegate encoder.
         *
         * @param cs the Pica charset this encoder belongs to
         * @param baseEncoder the delegate that turns mapped chars into bytes
         */
        PicaEncoder(Charset cs, CharsetEncoder baseEncoder) {
            super(cs, baseEncoder.averageBytesPerChar(), baseEncoder.maxBytesPerChar());
            this.baseEncoder = baseEncoder;
        }

        /**
         * Implementation of the encoding loop. The remaining input is
         * copied into a scratch buffer with the Pica mapping applied
         * (the caller's buffer may be read-only, so it is never written
         * to), then the delegate encoder turns the scratch buffer into
         * bytes. Finally the input position is advanced by exactly the
         * number of chars the delegate consumed, so overflow and error
         * results leave the unconsumed chars in place.
         *
         * NOTE(review): characters in 0xA0..0xFF that have no Pica mapping
         * are handed to the delegate unchanged and therefore come out as
         * their ISO-8859-1 byte, which the decoder may read back as a
         * different character. Confirm whether such input should instead
         * be reported as unmappable.
         *
         * @param cb the input characters
         * @param bb the output bytes
         * @return the result reported by the delegate encoder
         */
        @Override
        protected CoderResult encodeLoop(CharBuffer cb, ByteBuffer bb) {
            int start = cb.position();
            CharBuffer mapped = CharBuffer.allocate(cb.remaining());
            for (int pos = start; pos < cb.limit(); pos++) {
                char c = cb.get(pos);
                Character picaChar = ENCODE_MAP.get(c);
                mapped.put(picaChar != null ? picaChar : c);
            }
            mapped.flip();
            baseEncoder.reset();
            // Do not claim end-of-input on behalf of the caller: a trailing
            // high surrogate must surface as underflow so that the framework
            // decides, based on the real end-of-input flag, whether more
            // input may follow or the sequence is malformed.
            CoderResult result = baseEncoder.encode(mapped, bb, false);
            cb.position(start + mapped.position());
            return result;
        }
    }

    /**
     * The decoder implementation for the Pica Charset.
     */
    private static class PicaDecoder extends CharsetDecoder {

        /**
         * Constructor, call the superclass constructor with the
         * Charset object and pass along the chars/byte values
         * from the delegate decoder.
         *
         * @param cs the Pica charset this decoder belongs to
         * @param baseDecoder the delegate decoder, used for size hints only
         */
        PicaDecoder(Charset cs, CharsetDecoder baseDecoder) {
            // base decoder only needed for size hints
            super(cs, baseDecoder.averageCharsPerByte(), baseDecoder.maxCharsPerByte());
        }

        /**
         * Implementation of the decoding loop. Every input byte yields
         * exactly one char: the mapped character if the decode table has
         * an entry for the unsigned byte value, otherwise the char with
         * that same value.
         *
         * @param in the input bytes
         * @param out the output characters
         * @return OVERFLOW if the output is full while input remains,
         *         otherwise UNDERFLOW once all input has been consumed
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            while (in.hasRemaining()) {
                if (!out.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }
                char picaChar = (char) (in.get() & 0xFF);
                Character unicodeChar = DECODE_MAP.get(picaChar);
                out.put(unicodeChar != null ? unicodeChar : picaChar);
            }
            return CoderResult.UNDERFLOW;
        }
    }
}