diff --git a/settings.gradle b/settings.gradle
index 68d5e81..932b8b2 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -30,6 +30,7 @@ dependencyResolutionManagement {
     }
 }
 
+include 'sru-client-jdk'
 include 'z3950-asn1'
 include 'z3950-api'
 include 'z3950-common'
diff --git a/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/SRUClient.java b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/SRUClient.java
new file mode 100644
index 0000000..2eff6aa
--- /dev/null
+++ b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/SRUClient.java
@@ -0,0 +1,124 @@
+package org.xbib.sru.client.jdk;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.util.Objects;
+import java.util.function.Consumer;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.xbib.sru.client.jdk.util.UrlBuilder;
+
+/**
+ * Minimal SRU client on top of the JDK {@link HttpClient}; create instances via {@link #builder()}.
+ */
+public class SRUClient {
+
+    private static final Logger logger = Logger.getLogger(SRUClient.class.getName());
+
+    private final Builder builder;
+
+    private final HttpClient httpClient;
+
+    private SRUClient(Builder builder) {
+        this.builder = builder;
+        this.httpClient = HttpClient.newBuilder()
+                .followRedirects(HttpClient.Redirect.ALWAYS)
+                .build();
+    }
+
+    public static Builder builder() {
+        return new Builder();
+    }
+
+    /**
+     * Send a version 1.1 {@code searchRetrieve} GET request and pass the response body to a consumer.
+     *
+     * The consumer is only called for a status 200 response with a non-empty body; any other status is
+     * logged as a warning and otherwise ignored.
+     *
+     * @param query value of the query parameter, must not be null
+     * @param recordSchema value of the recordSchema parameter, or null to omit it
+     * @param startRecord value of the startRecord parameter, or null to omit it
+     * @param maximumRecords value of the maximumRecords parameter, or null to omit it
+     * @param consumer receives a reader over the response body, must not be null
+     * @throws IOException if the URL cannot be built or the HTTP exchange fails
+     * @throws InterruptedException if the HTTP exchange is interrupted
+     */
+    public void searchRetrieve(String query,
+                               String recordSchema,
+                               Integer startRecord,
+                               Integer maximumRecords,
+                               Consumer<Reader> consumer) throws IOException, InterruptedException {
+        Objects.requireNonNull(query, "query");
+        Objects.requireNonNull(consumer, "consumer");
+        UrlBuilder url = UrlBuilder.fromUrl(builder.baseURL);
+        url.queryParam(SRUConstants.OPERATION_PARAMETER, "searchRetrieve");
+        url.queryParam(SRUConstants.VERSION_PARAMETER, "1.1");
+        // the remaining parameters are optional; a null used to end in a NullPointerException
+        if (recordSchema != null) {
+            url.queryParam(SRUConstants.RECORD_SCHEMA_PARAMETER, recordSchema);
+        }
+        if (startRecord != null) {
+            url.queryParam(SRUConstants.START_RECORD_PARAMETER, Integer.toString(startRecord));
+        }
+        if (maximumRecords != null) {
+            url.queryParam(SRUConstants.MAXIMUM_RECORDS_PARAMETER, Integer.toString(maximumRecords));
+        }
+        url.queryParam(SRUConstants.QUERY_PARAMETER, query);
+        URI uri = URI.create(url.build().toExternalForm());
+        HttpRequest httpRequest = HttpRequest.newBuilder()
+                .uri(uri)
+                // "utf-8" is a charset, not a media type, so it belongs in accept-charset rather than accept
+                .header("accept-charset", "utf-8")
+                .header("user-agent", builder.userAgent != null ? builder.userAgent : "xbib SRU client")
+                .GET()
+                .build();
+        logger.log(Level.INFO, () -> "sending " + httpRequest);
+        HttpResponse<String> httpResponse = httpClient.send(httpRequest, HttpResponse.BodyHandlers.ofString());
+        int status = httpResponse.statusCode();
+        logger.log(Level.FINE, () -> "response status = " + status + " headers = " + httpResponse.headers());
+        if (status != 200) {
+            logger.log(Level.WARNING, () -> "unexpected response status " + status + " for " + uri);
+            return;
+        }
+        String body = httpResponse.body();
+        if (body != null && !body.isEmpty()) {
+            consumer.accept(new StringReader(body));
+        }
+    }
+
+    /**
+     * Currently a no-op.
+     */
+    public void close() {
+    }
+
+    public static class Builder {
+
+        private String baseURL;
+
+        private String userAgent;
+
+        private Builder() {
+        }
+
+        public Builder setBaseURL(String baseURL) {
+            this.baseURL = baseURL;
+            return this;
+        }
+
+        public Builder setUserAgent(String userAgent) {
+            this.userAgent = userAgent;
+            return this;
+        }
+
+        public SRUClient build() {
+            return new SRUClient(this);
+        }
+    }
+}
diff --git a/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/SRUConstants.java b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/SRUConstants.java
new file mode 100644
index 0000000..133de6a
--- /dev/null
+++ b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/SRUConstants.java
@@ -0,0 +1,17 @@
+package org.xbib.sru.client.jdk;
+
+public interface SRUConstants {
+
+    String OPERATION_PARAMETER = "operation";
+
+    String VERSION_PARAMETER = "version";
+
+    String RECORD_SCHEMA_PARAMETER = "recordSchema";
+
+    String QUERY_PARAMETER = "query";
+
+    String START_RECORD_PARAMETER = "startRecord";
+
+    String MAXIMUM_RECORDS_PARAMETER = "maximumRecords";
+
+}
diff --git 
a/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/PercentDecoder.java b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/PercentDecoder.java
new file mode 100755
index 0000000..2413ff3
--- /dev/null
+++ b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/PercentDecoder.java
@@ -0,0 +1,202 @@
+package org.xbib.sru.client.jdk.util;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.MalformedInputException;
+import java.nio.charset.UnmappableCharacterException;
+import static java.nio.charset.CoderResult.OVERFLOW;
+import static java.nio.charset.CoderResult.UNDERFLOW;
+
+/**
+ * Decodes percent-encoded (%XX) Unicode text.
+ *
+ * Instances reuse internal buffers between calls and must not be shared across threads.
+ */
+public final class PercentDecoder {
+
+    /**
+     * bytes represented by the current sequence of %-triples. Resized as needed.
+     */
+    private ByteBuffer encodedBuf;
+
+    /**
+     * Written to with decoded chars by decoder
+     */
+    private final CharBuffer decodedCharBuf;
+    private final CharsetDecoder decoder;
+
+    /**
+     * The decoded string for the current input
+     */
+    private final StringBuilder outputBuf = new StringBuilder();
+
+    /**
+     * Construct a new PercentDecoder with default buffer sizes.
+     *
+     * @param charsetDecoder Charset to decode bytes into chars with
+     * @see PercentDecoder#PercentDecoder(CharsetDecoder, int, int)
+     */
+    public PercentDecoder(CharsetDecoder charsetDecoder) {
+        this(charsetDecoder, 16, 16);
+    }
+
+    /**
+     * @param charsetDecoder Charset to decode bytes into chars with
+     * @param initialEncodedByteBufSize Initial size of buffer that holds encoded bytes
+     * @param decodedCharBufSize Size of buffer that encoded bytes are decoded into
+     * @throws IllegalArgumentException if either buffer size is less than 1
+     */
+    public PercentDecoder(CharsetDecoder charsetDecoder, int initialEncodedByteBufSize,
+            int decodedCharBufSize) {
+        if (initialEncodedByteBufSize < 1 || decodedCharBufSize < 1) {
+            throw new IllegalArgumentException("Buffer sizes must be positive");
+        }
+        encodedBuf = ByteBuffer.allocate(initialEncodedByteBufSize);
+        decodedCharBuf = CharBuffer.allocate(decodedCharBufSize);
+        decoder = charsetDecoder;
+    }
+
+    /**
+     * @param input Input with %-encoded representation of characters in this instance's configured character set, e.g.
+     * "%20" for a space character
+     * @return Corresponding string with %-encoded data decoded and converted to their corresponding characters
+     * @throws MalformedInputException if decoder is configured to report errors and malformed input is detected
+     * @throws UnmappableCharacterException if decoder is configured to report errors and an unmappable character is
+     * detected
+     */
+    public String decode(CharSequence input) throws MalformedInputException, UnmappableCharacterException {
+        outputBuf.setLength(0);
+        // this is almost always an underestimate of the size needed:
+        // only a 4-byte encoding (which is 12 characters input) would cause this to be an overestimate
+        outputBuf.ensureCapacity(input.length() / 8);
+        encodedBuf.clear();
+
+        for (int i = 0; i < input.length(); i++) {
+            char c = input.charAt(i);
+            if (c != '%') {
+                handleEncodedBytes();
+
+                outputBuf.append(c);
+                continue;
+            }
+
+            if (i + 2 >= input.length()) {
+                throw new IllegalArgumentException(
+                        "Could not percent decode <" + input + ">: incomplete %-pair at position " + i);
+            }
+
+            // grow the byte buf if needed
+            if (encodedBuf.remaining() == 0) {
+                ByteBuffer largerBuf = ByteBuffer.allocate(encodedBuf.capacity() * 2);
+                encodedBuf.flip();
+                largerBuf.put(encodedBuf);
+                encodedBuf = largerBuf;
+            }
+
+            // note that we advance i here as we consume chars
+            int msBits = Character.digit(input.charAt(++i), 16);
+            int lsBits = Character.digit(input.charAt(++i), 16);
+
+            if (msBits == -1 || lsBits == -1) {
+                throw new IllegalArgumentException("Invalid %-tuple <" + input.subSequence(i - 2, i + 1) + ">");
+            }
+
+            msBits <<= 4;
+            msBits |= lsBits;
+
+            // msBits can only have 8 bits set, so cast is safe
+            encodedBuf.put((byte) msBits);
+        }
+
+        handleEncodedBytes();
+
+        return outputBuf.toString();
+    }
+
+    /**
+     * Decode any buffered encoded bytes and write them to the output buf.
+     */
+    private void handleEncodedBytes() throws MalformedInputException, UnmappableCharacterException {
+        if (encodedBuf.position() == 0) {
+            // nothing to do
+            return;
+        }
+
+        decoder.reset();
+        CoderResult coderResult;
+
+        // switch to reading mode
+        encodedBuf.flip();
+
+        // loop while we're filling up the decoded char buf, or there's any encoded bytes
+        // decode() in practice seems to only consume bytes when it can decode an entire char...
+        do {
+            decodedCharBuf.clear();
+            coderResult = decoder.decode(encodedBuf, decodedCharBuf, false);
+            throwIfError(coderResult);
+            appendDecodedChars();
+        } while (coderResult == OVERFLOW && encodedBuf.hasRemaining());
+
+        // final decode with end-of-input flag
+        decodedCharBuf.clear();
+        coderResult = decoder.decode(encodedBuf, decodedCharBuf, true);
+        throwIfError(coderResult);
+
+        if (encodedBuf.hasRemaining()) {
+            throw new IllegalStateException("Final decode didn't error, but didn't consume remaining input bytes");
+        }
+        if (coderResult != UNDERFLOW) {
+            throw new IllegalStateException("Expected underflow, but instead final decode returned " + coderResult);
+        }
+
+        appendDecodedChars();
+
+        // we've finished the input, wrap it up
+        encodedBuf.clear();
+        flush();
+    }
+
+    /**
+     * Must only be called when the input encoded bytes buffer is empty
+     */
+    private void flush() throws MalformedInputException, UnmappableCharacterException {
+        CoderResult coderResult;
+        decodedCharBuf.clear();
+
+        coderResult = decoder.flush(decodedCharBuf);
+        appendDecodedChars();
+
+        throwIfError(coderResult);
+
+        if (coderResult != UNDERFLOW) {
+            throw new IllegalStateException("Decoder flush resulted in " + coderResult);
+        }
+    }
+
+    /**
+     * If coderResult is considered an error (i.e. not overflow or underflow), throw the corresponding
+     * CharacterCodingException.
+     *
+     * @param coderResult result to check
+     * @throws MalformedInputException if result represents malformed input
+     * @throws UnmappableCharacterException if result represents an unmappable character
+     */
+    private void throwIfError(CoderResult coderResult) throws MalformedInputException, UnmappableCharacterException {
+        if (coderResult.isMalformed()) {
+            throw new MalformedInputException(coderResult.length());
+        }
+        if (coderResult.isUnmappable()) {
+            throw new UnmappableCharacterException(coderResult.length());
+        }
+    }
+
+    /**
+     * Flip the decoded char buf and append it to the string buf
+     */
+    private void appendDecodedChars() {
+        decodedCharBuf.flip();
+        outputBuf.append(decodedCharBuf);
+    }
+}
diff --git a/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/PercentEncoder.java b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/PercentEncoder.java
new file mode 100755
index 0000000..b76ae46
--- /dev/null
+++ b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/PercentEncoder.java
@@ -0,0 +1,186 @@
+package org.xbib.sru.client.jdk.util;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.MalformedInputException;
+import java.nio.charset.UnmappableCharacterException;
+import java.util.BitSet;
+import static java.lang.Character.isHighSurrogate;
+import static java.lang.Character.isLowSurrogate;
+
+/**
+ * Encodes unsafe characters as a sequence of %XX hex-encoded bytes.
+ *
+ * This is typically done when encoding components of URLs. See {@link UrlPercentEncoders} for pre-configured
+ * PercentEncoder instances.
+ */
+public final class PercentEncoder {
+
+    private static final char[] HEX_CODE = "0123456789ABCDEF".toCharArray();
+
+    private final BitSet safeChars;
+    private final CharsetEncoder encoder;
+    /**
+     * Pre-allocate a string handler to make the common case of encoding to a string faster
+     */
+    private final StringBuilderPercentEncoderOutputHandler stringHandler = new StringBuilderPercentEncoderOutputHandler();
+    private final ByteBuffer encodedBytes;
+    private final CharBuffer unsafeCharsToEncode;
+
+    /**
+     * @param safeChars the set of chars to NOT encode, stored as a bitset with the int positions corresponding to
+     * those chars set to true. Treated as read only.
+     * @param charsetEncoder charset encoder to encode characters with. Make sure to not re-use CharsetEncoder instances
+     * across threads.
+     */
+    public PercentEncoder(BitSet safeChars, CharsetEncoder charsetEncoder) {
+        this.safeChars = safeChars;
+        this.encoder = charsetEncoder;
+
+        // why is this a float? sigh.
+        int maxBytesPerChar = 1 + (int) encoder.maxBytesPerChar();
+        // need to handle surrogate pairs, so need to be able to handle 2 chars worth of stuff at once
+        encodedBytes = ByteBuffer.allocate(maxBytesPerChar * 2);
+        unsafeCharsToEncode = CharBuffer.allocate(2);
+    }
+
+    /**
+     * Encode the input and pass output chars to a handler.
+     *
+     * @param input input string
+     * @param handler handler to call on each output character
+     * @throws MalformedInputException if encoder is configured to report errors and malformed input is detected
+     * @throws UnmappableCharacterException if encoder is configured to report errors and an unmappable character is
+     * detected
+     */
+    public void encode(CharSequence input, StringBuilderPercentEncoderOutputHandler handler) throws
+            MalformedInputException, UnmappableCharacterException {
+
+        for (int i = 0; i < input.length(); i++) {
+
+            char c = input.charAt(i);
+
+            if (safeChars.get(c)) {
+                handler.onOutputChar(c);
+                continue;
+            }
+
+            // not a safe char
+            unsafeCharsToEncode.clear();
+            unsafeCharsToEncode.append(c);
+            if (isHighSurrogate(c)) {
+                if (input.length() > i + 1) {
+                    // get the low surrogate as well
+                    char lowSurrogate = input.charAt(i + 1);
+                    if (isLowSurrogate(lowSurrogate)) {
+                        unsafeCharsToEncode.append(lowSurrogate);
+                        i++;
+                    } else {
+                        throw new IllegalArgumentException(
+                                "Invalid UTF-16: Char " + (i) + " is a high surrogate (\\u" + Integer
+                                        .toHexString(c) + "), but char " + (i + 1) + " is not a low surrogate (\\u" + Integer
+                                        .toHexString(lowSurrogate) + ")");
+                    }
+                } else {
+                    throw new IllegalArgumentException(
+                            "Invalid UTF-16: The last character in the input string was a high surrogate (\\u" + Integer
+                                    .toHexString(c) + ")");
+                }
+            }
+
+            flushUnsafeCharBuffer(handler);
+        }
+    }
+
+    /**
+     * Encode the input and return the resulting text as a String.
+     *
+     * @param input input string
+     * @return the input string with every character that's not in safeChars turned into its byte representation via the
+     * instance's encoder and then percent-encoded
+     * @throws MalformedInputException if encoder is configured to report errors and malformed input is detected
+     * @throws UnmappableCharacterException if encoder is configured to report errors and an unmappable character is
+     * detected
+     */
+    public String encode(CharSequence input) throws MalformedInputException, UnmappableCharacterException {
+        stringHandler.reset();
+        stringHandler.ensureCapacity(input.length());
+        encode(input, stringHandler);
+        return stringHandler.getContents();
+    }
+
+    /**
+     * Encode unsafeCharsToEncode to bytes as per charsetEncoder, then percent-encode those bytes into output.
+     *
+     * Side effects: unsafeCharsToEncode will be read from and cleared. encodedBytes will be cleared and written to.
+     *
+     */
+    private void flushUnsafeCharBuffer(StringBuilderPercentEncoderOutputHandler handler) throws MalformedInputException,
+            UnmappableCharacterException {
+        // need to read from the char buffer, which was most recently written to
+        unsafeCharsToEncode.flip();
+
+        encodedBytes.clear();
+
+        encoder.reset();
+        CoderResult result = encoder.encode(unsafeCharsToEncode, encodedBytes, true);
+        checkResult(result);
+        result = encoder.flush(encodedBytes);
+        checkResult(result);
+
+        // read contents of bytebuffer
+        encodedBytes.flip();
+
+        while (encodedBytes.hasRemaining()) {
+            byte b = encodedBytes.get();
+            handler.onOutputChar('%');
+            handler.onOutputChar(HEX_CODE[b >> 4 & 0xF]);
+            handler.onOutputChar(HEX_CODE[b & 0xF]);
+        }
+    }
+
+    /**
+     * @param result result to check
+     * @throws IllegalStateException if result is overflow
+     * @throws MalformedInputException if result represents malformed input
+     * @throws UnmappableCharacterException if result represents an unmappable character
+     */
+    private static void checkResult(CoderResult result) throws MalformedInputException, UnmappableCharacterException {
+        if (result.isOverflow()) {
+            throw new IllegalStateException("Byte buffer overflow; this should not happen.");
+        }
+        if (result.isMalformed()) {
+            throw new MalformedInputException(result.length());
+        }
+        if (result.isUnmappable()) {
+            throw new UnmappableCharacterException(result.length());
+        }
+    }
+
+    private static final class StringBuilderPercentEncoderOutputHandler {
+
+        private final StringBuilder stringBuilder;
+
+        StringBuilderPercentEncoderOutputHandler() {
+            stringBuilder = new StringBuilder();
+        }
+
+        String getContents() {
+            return stringBuilder.toString();
+        }
+
+        void reset() {
+            stringBuilder.setLength(0);
+        }
+
+        void ensureCapacity(int length) {
+            stringBuilder.ensureCapacity(length);
+        }
+
+        void onOutputChar(char c) {
+            stringBuilder.append(c);
+        }
+    }
+}
diff --git a/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/UrlBuilder.java b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/UrlBuilder.java
new file mode 100755
index 0000000..f72e3d5
--- /dev/null
+++ b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/UrlBuilder.java
@@ -0,0 +1,502 @@
+package org.xbib.sru.client.jdk.util;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+import static org.xbib.sru.client.jdk.util.UrlPercentEncoders.getFragmentEncoder;
+import static org.xbib.sru.client.jdk.util.UrlPercentEncoders.getMatrixEncoder;
+import static org.xbib.sru.client.jdk.util.UrlPercentEncoders.getPathEncoder;
+import static org.xbib.sru.client.jdk.util.UrlPercentEncoders.getQueryParamEncoder;
+import static org.xbib.sru.client.jdk.util.UrlPercentEncoders.getRegNameEncoder;
+import static 
org.xbib.sru.client.jdk.util.UrlPercentEncoders.getUnstructuredQueryEncoder;
+
+/**
+ * Builder for urls with url-encoding applied to path, query param, etc.
+ *
+ * Escaping rules are from RFC 3986, RFC 1738 and the HTML 4 spec
+ * This means that this diverges from the canonical URI/URL rules for the sake of being what you want to actually make
+ * HTTP-useful URLs.
+ */
+public final class UrlBuilder {
+
+    /**
+     * IPv6 address, cribbed from StackOverflow
+     */
+    private static final Pattern IPV6_PATTERN = Pattern
+            .compile(
+                    "\\A\\[((?:[0-9A-Fa-f]{1,4}(?::[0-9A-Fa-f]{1,4})*)?)::((?:[0-9A-Fa-f]{1,4}(?::[0-9A-Fa-f]{1,4})*)?)]\\z");
+
+    /**
+     * IPv4 dotted quad
+     */
+    private static final Pattern IPV4_PATTERN = Pattern
+            .compile("\\A(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}\\z");
+
+    private final String scheme;
+
+    private final String host;
+
+    private final Integer port;
+
+    private final List<Pair<String, String>> queryParams = new ArrayList<>();
+
+    /**
+     * If this is non-null, queryParams must be empty, and vice versa.
+     */
+    private String unstructuredQuery;
+
+    private final List<PathSegment> pathSegments = new ArrayList<>();
+
+    private final PercentEncoder pathEncoder = getPathEncoder();
+    private final PercentEncoder regNameEncoder = getRegNameEncoder();
+    private final PercentEncoder matrixEncoder = getMatrixEncoder();
+    private final PercentEncoder queryParamEncoder = getQueryParamEncoder();
+    private final PercentEncoder unstructuredQueryEncoder = getUnstructuredQueryEncoder();
+    private final PercentEncoder fragmentEncoder = getFragmentEncoder();
+
+    private String fragment;
+
+    private boolean forceTrailingSlash = false;
+
+    /**
+     * Create a URL with UTF-8 encoding.
+     *
+     * @param scheme scheme (e.g. http)
+     * @param host host (e.g. foo.com or 1.2.3.4 or [::1])
+     * @param port null or a positive integer
+     */
+    private UrlBuilder(String scheme, String host, Integer port) {
+        this.host = host;
+        this.scheme = scheme;
+        this.port = port;
+    }
+
+    /**
+     * Create a URL with a null port and UTF-8 encoding.
+     *
+     * @param scheme scheme (e.g. http)
+     * @param host host in any of the valid syntaxes: reg-name (a dns name), ipv4 literal (1.2.3.4), ipv6 literal
+     * ([::1]), excluding IPvFuture since no one uses that in practice
+     * @return a url builder
+     * @see UrlBuilder#forHost(String scheme, String host, int port)
+     */
+    public static UrlBuilder forHost(String scheme, String host) {
+        return new UrlBuilder(scheme, host, null);
+    }
+
+    /**
+     * @param scheme scheme (e.g. http)
+     * @param host host in any of the valid syntaxes: reg-name (a dns name), ipv4 literal (1.2.3.4), ipv6 literal
+     * ([::1]), excluding IPvFuture since no one uses that in practice
+     * @param port port
+     * @return a url builder
+     */
+    public static UrlBuilder forHost(String scheme, String host, int port) {
+        return new UrlBuilder(scheme, host, port);
+    }
+
+    public static UrlBuilder fromUrl(String urlSpec) throws CharacterCodingException, MalformedURLException {
+        return fromUrl(new URL(urlSpec));
+    }
+
+    /**
+     * Calls {@link UrlBuilder#fromUrl(URL, CharsetDecoder)} with a UTF-8 CharsetDecoder. The same semantics about the
+     * query string apply.
+     *
+     * @param url url to initialize builder with
+     * @return a UrlBuilder containing the host, path, etc. from the url
+     * @throws CharacterCodingException if char decoding fails
+     * @see UrlBuilder#fromUrl(URL, CharsetDecoder)
+     */
+    public static UrlBuilder fromUrl(URL url) throws CharacterCodingException {
+        return fromUrl(url, StandardCharsets.UTF_8.newDecoder());
+    }
+
+    /**
+     * Create a UrlBuilder initialized with the contents of a {@link URL}.
+     *
+     * The query string will be parsed into HTML4 query params if it can be separated into a
+     * &-separated sequence of key=value pairs. The sequence of query params can then be
+     * appended to by continuing to call {@link UrlBuilder#queryParam(String, String)}. The concept of query params is
+     * only part of the HTML spec (and common HTTP usage), though, so it's perfectly legal to have a query string that
+     * is in some other form. To represent this case, if the aforementioned param-parsing attempt fails, the query
+     * string will be treated as just a monolithic, unstructured, string. In this case, calls to {@link
+     * UrlBuilder#queryParam(String, String)} on the resulting instance will throw IllegalStateException, and only calls
+     * to {@link UrlBuilder#unstructuredQuery(String)}, which replaces the entire query string, are allowed.
+     *
+     * @param url url to initialize builder with
+     * @param charsetDecoder the decoder to decode encoded bytes with (except for reg names, which are always UTF-8)
+     * @return a UrlBuilder containing the host, path, etc. from the url
+     * @throws CharacterCodingException if decoding percent-encoded bytes fails and charsetDecoder is configured to
+     * report errors
+     * @see UrlBuilder#fromUrl(URL, CharsetDecoder)
+     */
+    public static UrlBuilder fromUrl(URL url, CharsetDecoder charsetDecoder) throws
+            CharacterCodingException {
+
+        PercentDecoder decoder = new PercentDecoder(charsetDecoder);
+        // reg names must be encoded UTF-8
+        PercentDecoder regNameDecoder;
+        if (charsetDecoder.charset().equals(StandardCharsets.UTF_8)) {
+            regNameDecoder = decoder;
+        } else {
+            regNameDecoder = new PercentDecoder(StandardCharsets.UTF_8.newDecoder());
+        }
+
+        Integer port = url.getPort();
+        if (port == -1) {
+            port = null;
+        }
+
+        UrlBuilder builder = new UrlBuilder(url.getProtocol(), regNameDecoder.decode(url.getHost()), port);
+
+        buildFromPath(builder, decoder, url);
+
+        buildFromQuery(builder, decoder, url);
+
+        if (url.getRef() != null) {
+            builder.fragment(decoder.decode(url.getRef()));
+        }
+
+        return builder;
+    }
+
+    /**
+     * Add a path segment.
+     *
+     * @param segment a path segment
+     * @return this
+     */
+    public UrlBuilder pathSegment(String segment) {
+        pathSegments.add(new PathSegment(segment));
+        return this;
+    }
+
+    /**
+     * Add multiple path segments. Equivalent to successive calls to {@link UrlBuilder#pathSegment(String)}.
+     *
+     * @param segments path segments
+     * @return this
+     */
+    public UrlBuilder pathSegments(String... segments) {
+        for (String segment : segments) {
+            pathSegment(segment);
+        }
+
+        return this;
+    }
+
+    /**
+     * Add an HTML query parameter. Query parameters will be encoded in the order added.
+     *
+     * Using query strings to encode key=value pairs is not part of the URI/URL specification; it is specified by
+     * HTML 4.
+     *
+     * If you use this method to build a query string, or created this builder from a url with a query string that can
+     * successfully be parsed into query param pairs, you cannot subsequently use {@link
+     * UrlBuilder#unstructuredQuery(String)}. See {@link UrlBuilder#fromUrl(URL, CharsetDecoder)}.
+     *
+     * @param name param name
+     * @param value param value
+     * @return this
+     */
+    public UrlBuilder queryParam(String name, String value) {
+        if (unstructuredQuery != null) {
+            throw new IllegalStateException(
+                    "Cannot call queryParam() when this already has an unstructured query specified");
+        }
+
+        queryParams.add(Pair.of(name, value));
+        return this;
+    }
+
+    /**
+     * Set the complete query string of arbitrary structure. This is useful when you want to specify a query string that
+     * is not of key=value format. If the query has previously been set via this method, subsequent calls will overwrite
+     * that query.
+     *
+     * If you use this method, or create a builder from a URL whose query is not parseable into query param pairs, you
+     * cannot subsequently use {@link UrlBuilder#queryParam(String, String)}. See {@link UrlBuilder#fromUrl(URL,
+     * CharsetDecoder)}.
+     *
+     * @param query Complete URI query, as specified by RFC 3986.
+     * @return this
+     */
+    public UrlBuilder unstructuredQuery(String query) {
+        if (!queryParams.isEmpty()) {
+            throw new IllegalStateException(
+                    "Cannot call unstructuredQuery() when this already has queryParam pairs specified");
+        }
+
+        unstructuredQuery = query;
+
+        return this;
+    }
+
+    /**
+     * Clear the unstructured query and any query params.
+     *
+     * Since the query / query param situation is a little complicated, this method will let you remove all query
+     * information and start again from scratch. This may be useful when taking an existing url, parsing it into a
+     * builder, and then re-doing its query params, for instance.
+     *
+     * @return this
+     */
+    public UrlBuilder clearQuery() {
+        queryParams.clear();
+        unstructuredQuery = null;
+
+        return this;
+    }
+
+    /**
+     * Add a matrix param to the last added path segment. If no segments have been added, the param will be added to the
+     * root. Matrix params will be encoded in the order added.
+     *
+     * @param name param name
+     * @param value param value
+     * @return this
+     */
+    public UrlBuilder matrixParam(String name, String value) {
+        if (pathSegments.isEmpty()) {
+            // create an empty path segment to represent a matrix param applied to the root
+            pathSegment("");
+        }
+
+        PathSegment seg = pathSegments.get(pathSegments.size() - 1);
+        seg.matrixParams.add(Pair.of(name, value));
+        return this;
+    }
+
+    /**
+     * Set the fragment.
+     *
+     * @param fragment fragment string
+     * @return this
+     */
+    public UrlBuilder fragment(String fragment) {
+        this.fragment = fragment;
+        return this;
+    }
+
+    /**
+     * Force the generated URL to have a trailing slash at the end of the path.
+     *
+     * @return this
+     */
+    public UrlBuilder forceTrailingSlash() {
+        forceTrailingSlash = true;
+        return this;
+    }
+
+    public URL build() throws CharacterCodingException, MalformedURLException {
+        return new URL(toUrlString());
+    }
+
+    /**
+     * Encode the current builder state into a URL string.
+     *
+     * @return a well-formed URL string
+     * @throws CharacterCodingException if character encoding fails and the encoder is configured to report errors
+     */
+    public String toUrlString() throws CharacterCodingException {
+        StringBuilder buf = new StringBuilder();
+
+        buf.append(scheme);
+        buf.append("://");
+
+        buf.append(encodeHost(host));
+        if (port != null) {
+            buf.append(':');
+            buf.append(port);
+        }
+
+        for (PathSegment pathSegment : pathSegments) {
+            buf.append('/');
+            buf.append(pathEncoder.encode(pathSegment.segment));
+
+            for (Pair<String, String> matrixParam : pathSegment.matrixParams) {
+                buf.append(';');
+                buf.append(matrixEncoder.encode(matrixParam.getKey()));
+                buf.append('=');
+                buf.append(matrixEncoder.encode(matrixParam.getValue()));
+            }
+        }
+
+        if (forceTrailingSlash) {
+            buf.append('/');
+        }
+
+        if (!queryParams.isEmpty()) {
+            buf.append("?");
+            Iterator<Pair<String, String>> qpIter = queryParams.iterator();
+            while (qpIter.hasNext()) {
+                Pair<String, String> queryParam = qpIter.next();
+                buf.append(queryParamEncoder.encode(queryParam.getKey()));
+                buf.append('=');
+                buf.append(queryParamEncoder.encode(queryParam.getValue()));
+                if (qpIter.hasNext()) {
+                    buf.append('&');
+                }
+            }
+        } else if (unstructuredQuery != null) {
+            buf.append("?");
+            buf.append(unstructuredQueryEncoder.encode(unstructuredQuery));
+        }
+
+        if (fragment != null) {
+            buf.append('#');
+            buf.append(fragmentEncoder.encode(fragment));
+        }
+
+        return buf.toString();
+    }
+
+    /**
+     * Populate a url builder based on the query of a url
+     *
+     * @param builder builder
+     * @param decoder decoder
+     * @param url url
+     * @throws CharacterCodingException
+     */
+    private static void buildFromQuery(UrlBuilder builder, PercentDecoder decoder, URL url) throws
+            CharacterCodingException {
+        if (url.getQuery() != null) {
+            String q = url.getQuery();
+
+            // try to parse into &-separated key=value pairs
+            List<Pair<String, String>> pairs = new ArrayList<>();
+            boolean parseOk = true;
+
+            for (String queryChunk : q.split("&")) {
+                String[] queryParamChunks = queryChunk.split("=", 2);
+
+                if (queryParamChunks.length != 2) {
+                    parseOk = false;
+                    break;
+                }
+
+                pairs.add(Pair.of(decoder.decode(queryParamChunks[0]),
+                        decoder.decode(queryParamChunks[1])));
+            }
+
+            if (parseOk) {
+                for (Pair<String, String> pair : pairs) {
+                    builder.queryParam(pair.getKey(), pair.getValue());
+                }
+            } else {
+                builder.unstructuredQuery(decoder.decode(q));
+            }
+        }
+    }
+
+    /**
+     * Populate the path segments of a url builder from a url
+     *
+     * @param builder builder
+     * @param decoder decoder
+     * @param url url
+     * @throws CharacterCodingException
+     */
+    private static void buildFromPath(UrlBuilder builder, PercentDecoder decoder, URL url) throws
+            CharacterCodingException {
+        for (String pathChunk : url.getPath().split("/")) {
+            if (pathChunk.equals("")) {
+                continue;
+            }
+
+            if (pathChunk.charAt(0) == ';') {
+                builder.pathSegment("");
+                // empty path segment, but matrix params
+                for (String matrixChunk : pathChunk.substring(1).split(";")) {
+                    buildFromMatrixParamChunk(decoder, builder, matrixChunk);
+                }
+
+                continue;
+            }
+
+            // otherwise, path chunk is non empty and does not start with a ';'
+
+            String[] matrixChunks = pathChunk.split(";");
+
+            // first chunk is always the path segment. If there is a trailing ; and no matrix params, the ; will
+            // not be included in the final url.
+            builder.pathSegment(decoder.decode(matrixChunks[0]));
+
+            // if there are any other chunks, they're matrix param pairs
+            for (int i = 1; i < matrixChunks.length; i++) {
+                buildFromMatrixParamChunk(decoder, builder, matrixChunks[i]);
+            }
+        }
+    }
+
+    private static void buildFromMatrixParamChunk(PercentDecoder decoder, UrlBuilder ub, String pathMatrixChunk) throws
+            CharacterCodingException {
+        String[] mtxPair = pathMatrixChunk.split("=", 2);
+        if (mtxPair.length != 2) {
+            throw new IllegalArgumentException("Malformed matrix param: <" + pathMatrixChunk + ">");
+        }
+
+        String mtxName = mtxPair[0];
+        String mtxVal = mtxPair[1];
+        ub.matrixParam(decoder.decode(mtxName), decoder.decode(mtxVal));
+    }
+
+    /**
+     * @param host original host string
+     * @return host encoded as in RFC 3986 section 3.2.2
+     */
+    private String encodeHost(String host) throws CharacterCodingException {
+        // matching order: IP-literal, IPv4, reg-name
+        if (IPV4_PATTERN.matcher(host).matches() || IPV6_PATTERN.matcher(host).matches()) {
+            return host;
+        }
+
+        // it's a reg-name, which MUST be encoded as UTF-8 (regardless of the rest of the URL)
+        return regNameEncoder.encode(host);
+    }
+
+    /**
+     * Bundle of a path segment name and any associated matrix params.
+     */
+    private static class PathSegment {
+        private final String segment;
+        private final List<Pair<String, String>> matrixParams = new ArrayList<>();
+
+        PathSegment(String segment) {
+            this.segment = segment;
+        }
+    }
+
+    static class Pair<K, V> {
+
+        private final K key;
+
+        private final V value;
+
+        Pair(K key, V value) {
+            this.key = key;
+            this.value = value;
+        }
+
+        static <K, V> Pair<K, V> of(K key, V value) {
+            return new Pair<>(key, value);
+        }
+
+        K getKey() {
+            return key;
+        }
+
+        V getValue() {
+            return value;
+        }
+
+    }
+}
diff --git a/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/UrlPercentEncoders.java b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/UrlPercentEncoders.java
new file mode 100755
index 0000000..84f1f88
--- /dev/null
+++ b/sru-client-jdk/src/main/java/org/xbib/sru/client/jdk/util/UrlPercentEncoders.java
@@ -0,0 +1,164 @@
+package org.xbib.sru.client.jdk.util;
+
+import java.nio.charset.StandardCharsets;
+import java.util.BitSet;
+import static java.nio.charset.CodingErrorAction.REPLACE;
+
+/**
+ * See RFC 3986, RFC 1738 and ....
+ */
+public final class UrlPercentEncoders {
+
+    /**
+     * an encoder for RFC 3986 reg-names
+     */
+
+    private static final BitSet REG_NAME_BIT_SET = new BitSet();
+
+    private static final BitSet PATH_BIT_SET = new BitSet();
+    private static final BitSet MATRIX_BIT_SET = new BitSet();
+    private static final BitSet UNSTRUCTURED_QUERY_BIT_SET = new BitSet();
+    private static final BitSet QUERY_PARAM_BIT_SET = new BitSet();
+    private static final BitSet FRAGMENT_BIT_SET = new BitSet();
+
+    static {
+        // RFC 3986 'reg-name'. This is not very aggressive... it's quite possible to have DNS-illegal names out of this.
+        // Regardless, it will at least be URI-compliant even if it's not HTTP URL-compliant.
+        addUnreserved(REG_NAME_BIT_SET);
+        addSubdelims(REG_NAME_BIT_SET);
+
+        // Represents RFC 3986 'pchar'. Remove delimiter that starts matrix section.
+ addPChar(PATH_BIT_SET); + PATH_BIT_SET.clear((int) ';'); + + // Remove delims for HTTP matrix params as per RFC 1738 S3.3. The other reserved chars ('/' and '?') are already excluded. + addPChar(MATRIX_BIT_SET); + MATRIX_BIT_SET.clear((int) ';'); + MATRIX_BIT_SET.clear((int) '='); + + /* + * At this point it represents RFC 3986 'query'. http://www.w3.org/TR/html4/interact/forms.html#h-17.13.4.1 also + * specifies that "+" can mean space in a query, so we will make sure to say that '+' is not safe to leave as-is + */ + addQuery(UNSTRUCTURED_QUERY_BIT_SET); + UNSTRUCTURED_QUERY_BIT_SET.clear((int) '+'); + + /* + * Create more stringent requirements for HTML4 queries: remove delimiters for HTML query params so that key=value + * pairs can be used. + */ + QUERY_PARAM_BIT_SET.or(UNSTRUCTURED_QUERY_BIT_SET); + QUERY_PARAM_BIT_SET.clear((int) '='); + QUERY_PARAM_BIT_SET.clear((int) '&'); + + addFragment(FRAGMENT_BIT_SET); + } + + public static PercentEncoder getRegNameEncoder() { + return new PercentEncoder(REG_NAME_BIT_SET, StandardCharsets.UTF_8.newEncoder().onMalformedInput(REPLACE) + .onUnmappableCharacter(REPLACE)); + } + + public static PercentEncoder getPathEncoder() { + return new PercentEncoder(PATH_BIT_SET, StandardCharsets.UTF_8.newEncoder().onMalformedInput(REPLACE) + .onUnmappableCharacter(REPLACE)); + } + + public static PercentEncoder getMatrixEncoder() { + return new PercentEncoder(MATRIX_BIT_SET, StandardCharsets.UTF_8.newEncoder().onMalformedInput(REPLACE) + .onUnmappableCharacter(REPLACE)); + } + + public static PercentEncoder getUnstructuredQueryEncoder() { + return new PercentEncoder(UNSTRUCTURED_QUERY_BIT_SET, StandardCharsets.UTF_8.newEncoder().onMalformedInput(REPLACE) + .onUnmappableCharacter(REPLACE)); + } + + public static PercentEncoder getQueryParamEncoder() { + return new PercentEncoder(QUERY_PARAM_BIT_SET, StandardCharsets.UTF_8.newEncoder().onMalformedInput(REPLACE) + .onUnmappableCharacter(REPLACE)); + } + + public static PercentEncoder 
getFragmentEncoder() { + return new PercentEncoder(FRAGMENT_BIT_SET, StandardCharsets.UTF_8.newEncoder().onMalformedInput(REPLACE) + .onUnmappableCharacter(REPLACE)); + } + + private UrlPercentEncoders() { + } + + /** + * Add code points for 'fragment' chars + * + * @param fragmentBitSet bit set + */ + private static void addFragment(BitSet fragmentBitSet) { + addPChar(fragmentBitSet); + fragmentBitSet.set((int) '/'); + fragmentBitSet.set((int) '?'); + } + + /** + * Add code points for 'query' chars + * + * @param queryBitSet bit set + */ + private static void addQuery(BitSet queryBitSet) { + addPChar(queryBitSet); + queryBitSet.set((int) '/'); + queryBitSet.set((int) '?'); + } + + /** + * Add code points for 'pchar' chars. + * + * @param bs bitset + */ + private static void addPChar(BitSet bs) { + addUnreserved(bs); + addSubdelims(bs); + bs.set((int) ':'); + bs.set((int) '@'); + } + + /** + * Add codepoints for 'unreserved' chars + * + * @param bs bitset to add codepoints to + */ + private static void addUnreserved(BitSet bs) { + + for (int i = 'a'; i <= 'z'; i++) { + bs.set(i); + } + for (int i = 'A'; i <= 'Z'; i++) { + bs.set(i); + } + for (int i = '0'; i <= '9'; i++) { + bs.set(i); + } + bs.set((int) '-'); + bs.set((int) '.'); + bs.set((int) '_'); + bs.set((int) '~'); + } + + /** + * Add codepoints for 'sub-delims' chars + * + * @param bs bitset to add codepoints to + */ + private static void addSubdelims(BitSet bs) { + bs.set((int) '!'); + bs.set((int) '$'); + bs.set((int) '&'); + bs.set((int) '\''); + bs.set((int) '('); + bs.set((int) ')'); + bs.set((int) '*'); + bs.set((int) '+'); + bs.set((int) ','); + bs.set((int) ';'); + bs.set((int) '='); + } +} diff --git a/z3950-api/src/main/java/org/xbib/z3950/api/InitListener.java b/z3950-api/src/main/java/org/xbib/z3950/api/InitListener.java index 0a6e212..d6c7dee 100644 --- a/z3950-api/src/main/java/org/xbib/z3950/api/InitListener.java +++ b/z3950-api/src/main/java/org/xbib/z3950/api/InitListener.java @@ -1,8 +1,5 
@@ package org.xbib.z3950.api; -/** - * - */ @FunctionalInterface public interface InitListener { diff --git a/z3950-api/src/main/java/org/xbib/z3950/api/RecordListener.java b/z3950-api/src/main/java/org/xbib/z3950/api/RecordListener.java index f619819..3fb79b8 100644 --- a/z3950-api/src/main/java/org/xbib/z3950/api/RecordListener.java +++ b/z3950-api/src/main/java/org/xbib/z3950/api/RecordListener.java @@ -1,8 +1,5 @@ package org.xbib.z3950.api; -/** - * - */ @FunctionalInterface public interface RecordListener { diff --git a/z3950-api/src/main/java/org/xbib/z3950/api/ScanListener.java b/z3950-api/src/main/java/org/xbib/z3950/api/ScanListener.java index 3bd8cd3..b2582c5 100644 --- a/z3950-api/src/main/java/org/xbib/z3950/api/ScanListener.java +++ b/z3950-api/src/main/java/org/xbib/z3950/api/ScanListener.java @@ -2,9 +2,6 @@ package org.xbib.z3950.api; import org.xbib.asn1.BEREncoding; -/** - * - */ @FunctionalInterface public interface ScanListener { diff --git a/z3950-api/src/main/java/org/xbib/z3950/api/SearchListener.java b/z3950-api/src/main/java/org/xbib/z3950/api/SearchListener.java index 66d0ad4..c46a67b 100644 --- a/z3950-api/src/main/java/org/xbib/z3950/api/SearchListener.java +++ b/z3950-api/src/main/java/org/xbib/z3950/api/SearchListener.java @@ -2,9 +2,6 @@ package org.xbib.z3950.api; import java.io.IOException; -/** - * - */ @FunctionalInterface public interface SearchListener { diff --git a/z3950-groovy/src/test/groovy/org/xbib/z3950/groovy/LVITest.groovy b/z3950-groovy/src/test/groovy/org/xbib/z3950/groovy/LVITest.groovy index 2902711..9fdceda 100644 --- a/z3950-groovy/src/test/groovy/org/xbib/z3950/groovy/LVITest.groovy +++ b/z3950-groovy/src/test/groovy/org/xbib/z3950/groovy/LVITest.groovy @@ -31,7 +31,7 @@ class LVITest { //String query = "@attr 1=12 \"(DE-101)1016677359\"" // record ID plus prefix OK! //String query = "@attr 1=12 \"(DE-600)2635378-7\"" // ZDB ID plus prefix NOT OK! //String query = "@attr 1=1016 \"2020\"" // any OK! 
- //String query = "@attr 1=1052 12-7" // ZDB-ID OK! + String query = "@attr 1=1052 12-7" // ZDB-ID OK! String preferredRecordSyntax = "marc21" int from = 1 int size = 1