/*
 * NOTE(review): patch context that shared these collapsed lines, kept for reference:
 *   - gradle.properties: version = 3.0.0 -> version = 3.0.1
 *   - net-resource/NOTICE.txt (new file):
 *       This IRI implementation is taken from Daniel Fuchs' writeup for java.net.IRI
 *       http://cr.openjdk.java.net/%7Edfuchs/writeups/updating-uri/
 *   - net-resource/src/main/java/module-info.java (new file):
 *       module org.xbib.net.resource { exports org.xbib.net.resource; }
 *   - this class lives at net-resource/src/main/java/org/xbib/net/resource/IPAddressUtil.java,
 *     i.e. in package org.xbib.net.resource
 *   - the patch header of net-resource/src/main/java/org/xbib/net/resource/IRI.java (new file)
 *     and the opening of its Oracle license comment (Copyright (c) 2000, 2019), which
 *     continues on the following line
 */

/*
 * Copyright (c) 2004, 2005, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/**
 * Static helpers that convert textual IPv4 / IPv6 address literals into their
 * network-order binary form, without performing any name lookup.
 *
 * <p>All parsers return {@code null} (rather than throwing) when the input is
 * {@code null} or is not a syntactically valid literal.
 */
public class IPAddressUtil {
    private static final int INADDR4SZ = 4;
    private static final int INADDR16SZ = 16;
    private static final int INT16SZ = 2;

    /**
     * Converts an IPv4 address in its textual presentation form into its
     * numeric binary form.
     *
     * <p>One to four dot-separated decimal parts are accepted:
     * <ul>
     * <li>one part: the value is stored directly as a 32-bit quantity;</li>
     * <li>two parts: the last part is a 24-bit quantity placed in the
     *     rightmost three bytes (Class A style, {@code net.host});</li>
     * <li>three parts: the last part is a 16-bit quantity placed in the
     *     rightmost two bytes (Class B style, {@code 128.net.host});</li>
     * <li>four parts: each part is one byte, assigned left to right.</li>
     * </ul>
     * The leading parts are parsed as single bytes directly into the result;
     * the final part is treated as an 8-to-32-bit entity and spread over the
     * remaining bytes.
     *
     * <p>Only ASCII decimal digits are accepted, and every part must contain
     * at least one digit (so {@code "1..2"}, {@code ".1"} and {@code "1."}
     * are rejected).
     *
     * @param src a String representing an IPv4 address in standard format
     * @return a 4-byte array holding the IPv4 numeric address, or
     *         {@code null} if {@code src} is {@code null} or malformed
     */
    public static byte[] textToNumericFormatV4(String src) {
        if (src == null) {
            return null;
        }
        int len = src.length();
        // "255.255.255.255" is the longest possible literal.
        if (len == 0 || len > 15) {
            return null;
        }

        byte[] res = new byte[INADDR4SZ];
        // At most 15 decimal digits are accumulated, so a long cannot overflow.
        long tmpValue = 0;
        int currByte = 0;
        // True until the current part has received at least one digit.
        boolean newOctet = true;

        for (int i = 0; i < len; i++) {
            char c = src.charAt(i);
            if (c == '.') {
                if (newOctet || tmpValue > 0xff || currByte == 3) {
                    return null;
                }
                res[currByte++] = (byte) (tmpValue & 0xff);
                tmpValue = 0;
                newOctet = true;
            } else {
                int digit = asciiDigit(c, 10);
                if (digit < 0) {
                    return null;
                }
                tmpValue = tmpValue * 10 + digit;
                newOctet = false;
            }
        }
        // The last part must exist and fit into the bytes that are still unset.
        if (newOctet || tmpValue >= (1L << ((INADDR4SZ - currByte) * 8))) {
            return null;
        }
        // Spread the last part, least significant byte first, over res[currByte..3].
        for (int k = INADDR4SZ - 1; k >= currByte; k--) {
            res[k] = (byte) (tmpValue & 0xff);
            tmpValue >>= 8;
        }
        return res;
    }

    /**
     * Converts an IPv6 presentation-level address to network-order binary form.
     * Credit: converted from C code from Solaris 8 (inet_pton).
     *
     * <p>Any component of the string following a per-cent {@code %} (a scope
     * identifier) is ignored, but it must not be empty. Only ASCII hexadecimal
     * digits are accepted. If the parsed address is an IPv4-mapped address
     * ({@code ::ffff:a.b.c.d}), the embedded 4-byte IPv4 address is returned
     * instead of the 16-byte form.
     *
     * @param src a String representing an IPv6 address in textual format
     * @return a 16-byte array (or 4-byte array for IPv4-mapped addresses)
     *         holding the numeric address, or {@code null} if {@code src} is
     *         {@code null} or malformed
     */
    public static byte[] textToNumericFormatV6(String src) {
        // Shortest valid string is "::", hence at least 2 chars
        if (src == null || src.length() < 2) {
            return null;
        }

        char[] srcb = src.toCharArray();
        byte[] dst = new byte[INADDR16SZ];

        int srcbLength = srcb.length;
        int pc = src.indexOf('%');
        if (pc == srcbLength - 1) {
            // '%' with nothing after it: empty scope identifier.
            return null;
        }
        if (pc != -1) {
            // Parse only the part in front of the scope identifier.
            srcbLength = pc;
        }

        int colonp = -1;   // index in dst where "::" was seen, -1 if none yet
        int i = 0;         // read position in srcb
        int j = 0;         // write position in dst

        /* Leading :: requires some special handling. */
        if (srcb[i] == ':') {
            if (srcb[++i] != ':') {
                return null;
            }
        }
        int curtok = i;    // start of the current group, needed for an IPv4 tail
        boolean sawXdigit = false;
        int val = 0;
        while (i < srcbLength) {
            char ch = srcb[i++];
            int chval = asciiDigit(ch, 16);
            if (chval != -1) {
                val <<= 4;
                val |= chval;
                if (val > 0xffff) {
                    return null;
                }
                sawXdigit = true;
                continue;
            }
            if (ch == ':') {
                curtok = i;
                if (!sawXdigit) {
                    // Empty group: this is "::", which may appear only once.
                    if (colonp != -1) {
                        return null;
                    }
                    colonp = j;
                    continue;
                } else if (i == srcbLength) {
                    // A single trailing ':' is not allowed.
                    return null;
                }
                if (j + INT16SZ > INADDR16SZ) {
                    return null;
                }
                dst[j++] = (byte) ((val >> 8) & 0xff);
                dst[j++] = (byte) (val & 0xff);
                sawXdigit = false;
                val = 0;
                continue;
            }
            if (ch == '.' && ((j + INADDR4SZ) <= INADDR16SZ)) {
                String ia4 = src.substring(curtok, srcbLength);
                /* check this IPv4 address has 3 dots, ie. A.B.C.D */
                int dotCount = 0;
                int index = 0;
                while ((index = ia4.indexOf('.', index)) != -1) {
                    dotCount++;
                    index++;
                }
                if (dotCount != 3) {
                    return null;
                }
                byte[] v4addr = textToNumericFormatV4(ia4);
                if (v4addr == null) {
                    return null;
                }
                System.arraycopy(v4addr, 0, dst, j, INADDR4SZ);
                j += INADDR4SZ;
                sawXdigit = false;
                break;  /* '\0' was seen by inet_pton4(). */
            }
            return null;
        }
        if (sawXdigit) {
            // Flush the last hexadecimal group.
            if (j + INT16SZ > INADDR16SZ) {
                return null;
            }
            dst[j++] = (byte) ((val >> 8) & 0xff);
            dst[j++] = (byte) (val & 0xff);
        }

        if (colonp != -1) {
            // "::" stands for at least one zero group, so the buffer cannot be full.
            if (j == INADDR16SZ) {
                return null;
            }
            // Shift everything written after "::" to the end of the buffer and
            // zero-fill the gap.
            int n = j - colonp;
            for (int k = 1; k <= n; k++) {
                dst[INADDR16SZ - k] = dst[colonp + n - k];
                dst[colonp + n - k] = 0;
            }
            j = INADDR16SZ;
        }
        if (j != INADDR16SZ) {
            return null;
        }
        byte[] newdst = convertFromIPv4MappedAddress(dst);
        return (newdst != null) ? newdst : dst;
    }

    /**
     * Tests whether the given string is an IPv4 literal address.
     *
     * @param src a String representing an IPv4 address in textual format
     * @return a boolean indicating whether src is an IPv4 literal address
     */
    public static boolean isIPv4LiteralAddress(String src) {
        return textToNumericFormatV4(src) != null;
    }

    /**
     * Tests whether the given string is an IPv6 literal address.
     *
     * @param src a String representing an IPv6 address in textual format
     * @return a boolean indicating whether src is an IPv6 literal address
     */
    public static boolean isIPv6LiteralAddress(String src) {
        return textToNumericFormatV6(src) != null;
    }

    /**
     * Converts an IPv4-mapped IPv6 address to an IPv4 address. Both the input
     * and the returned value are in network-order binary form.
     *
     * @param addr a byte array holding a 16-byte IPv6 numeric address
     * @return a new 4-byte array holding the embedded IPv4 address, or
     *         {@code null} if {@code addr} is {@code null} or is not an
     *         IPv4-mapped address
     */
    public static byte[] convertFromIPv4MappedAddress(byte[] addr) {
        if (isIPv4MappedAddress(addr)) {
            byte[] newAddr = new byte[INADDR4SZ];
            System.arraycopy(addr, 12, newAddr, 0, INADDR4SZ);
            return newAddr;
        }
        return null;
    }

    /**
     * Checks whether the given binary address is an IPv4-mapped IPv6 address,
     * i.e. ten zero bytes followed by {@code 0xff 0xff}.
     *
     * @param addr the binary address to test, may be {@code null}
     * @return {@code true} if the address is an IPv4-mapped IPv6 address;
     *         {@code false} otherwise, including for {@code null} and for
     *         arrays shorter than 16 bytes (such as IPv4 addresses)
     */
    private static boolean isIPv4MappedAddress(byte[] addr) {
        if (addr == null || addr.length < INADDR16SZ) {
            return false;
        }
        for (int k = 0; k < 10; k++) {
            if (addr[k] != 0x00) {
                return false;
            }
        }
        return addr[10] == (byte) 0xff && addr[11] == (byte) 0xff;
    }

    /**
     * Returns the numeric value of an ASCII digit in the given radix.
     * Unlike {@link Character#digit(char, int)}, non-ASCII Unicode digits are
     * never accepted, so look-alike characters cannot form an address literal.
     *
     * @param c the character to convert
     * @param radix the radix (10 and 16 are the values used by this class)
     * @return the digit value, or {@code -1} if {@code c} is not an ASCII
     *         digit valid in {@code radix}
     */
    private static int asciiDigit(char c, int radix) {
        int value;
        if (c >= '0' && c <= '9') {
            value = c - '0';
        } else if (c >= 'a' && c <= 'f') {
            value = c - 'a' + 10;
        } else if (c >= 'A' && c <= 'F') {
            value = c - 'A' + 10;
        } else {
            return -1;
        }
        return value < radix ? value : -1;
    }
}
+ */ + +package org.xbib.net.resource; + +import java.io.File; +import java.io.IOException; +import java.io.InvalidObjectException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.ObjectStreamException; +import java.io.Serializable; +import java.net.IDN; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.text.Normalizer; + +/** + * Represents a Uniform Resource Identifier (URI) reference or an + * Internationalized Resource Identifier (IRI) reference. + * + *

Every Uniform Resource Identifier ( a URI specified + * by RFC 3986 ) + * is by definition an IRI. This class, and the operations within, can + * therefore be used generally to represent and manipulate a Uniform + * Resource Identifier, as specified by RFC 3986. + * This class should be used in preference to {@link java.net.URI + * java.net.URI} where possible, even if the resource identifier does not + * contain internationalized characters. + *

This class, and its operations, supersede that of {@link java.net.URI + * java.net.URI} + * + *

Aside from some minor deviations noted below, an instance of this + * class represents a URI reference as defined by + * RFC 3986: Uniform + * Resource Identifiers (URI): Generic Syntax, or an IRI reference + * as defined by + * RFC 3987: Internationalized Resource Identifiers (IRIs). + * IRIs are defined similarly to URIs, except that the permitted characters + * have been extended by adding the characters of the + * UCS (Universal Character Set, ISO10646). + * + * This class provides factory methods for creating IRI instances from + * their components or by parsing their string forms, methods for accessing the + * various components of an instance, and methods for normalizing, resolving, + * and relativizing IRI instances. Instances of this class are immutable. + * For convenience, a lightweight {@linkplain Builder builder} is also provided. + * + *

URI/IRI syntax and components

+ * + * At the highest level a URI (or IRI) reference (hereinafter simply "URI") in string + * form has the syntax + * + *
+ * [scheme{@code :}][{@code //}authority]path[{@code ?}query][{@code #}fragment] + *
+ * + * where square brackets [...] delineate optional components and the characters + * {@code :}, {@code /}, {@code ?} and {@code #} + * stand for themselves. + * + *

Some examples of URIs and their component parts are: + *

+ * http://example.com:8042/over/there?name=ferret#nose
+ * \_/    \______________/\_________/ \________/  \__/
+ *  |             |            |          |         |
+ * scheme        authority    path       query     fragment
+ *  |   _______________________|
+ * / \ /             \
+ * urn:isbn:0439784549
+ * 
+ * + *

A URI is said to be absolute if it specifies a scheme. + * Otherwise it is a relative URI reference. + * + *

The authority component of a URI parses according to the following syntax + * + *

+ * [user-info{@code @}]host[{@code :}port] + *
+ * + * where the characters {@code @} and {@code :} stand for + * themselves. The host component can be an IP-literal (a bracket enclosed + * IPv6address or IPvFuture), an IPv4address, or just a name. + * + *

The path component of a URI can be either hierarchical + * or opaque to hierarchy. An absolute URI that doesn't specify + * any authority component may have a path opaque to hierarchy. + * + *

A hierarchical path component is itself said to be absolute + * if it begins with a slash character ({@code '/'}); otherwise it is + * relative. + * The path of a URI that specifies an authority component, if non empty, + * is always absolute. + * + *

A URI has a path that is opaque to hierarchy if the URI + * has a scheme but no authority component, and its path isn't empty + * and doesn't start with {@code '/'}. + * + *

For simplification this + * document designates URIs with a hierarchical path as + * hierarchical URIs and URIs with a path opaque to hierarchy as + * opaque URIs. + * + *

Some examples of opaque URIs are: + * + *

+ * + *

As stated above, a URI has a path that is hierarchical + * if it has an authority component, or it has no scheme, or its + * path is empty or starts with a {@code '/'}. + * Some examples of hierarchical URIs are: + * + *

+ * + * (Note that a URI of the form {@code "file:foo"} is considered opaque, + * since it has a scheme and its path isn't empty and does not start + * with {@code '/'}). + * + *

All told, then, an IRI instance has the following eight components: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
Describes the components of a URI:scheme,authority,user-info,host,port,path,query,fragment
ComponentType
scheme{@code String}
authority{@code String}
user-info{@code String}
host{@code String}
port{@code int}
path{@code String}
query{@code String}
fragment{@code String}
 + * + * In a given instance any particular component is either undefined or + * defined with a distinct value. Undefined string components are + * represented by {@code null}, while undefined integer components are + * represented by {@code -1}. A string component may be defined to have the + * empty string as its value; this is not equivalent to that component being + * undefined. For instance, an authority component can either be undefined, + * as in {@code "file:/path"} or defined and empty, as in {@code "file:///path"}, + * or defined and non-empty, as in {@code "http://localhost:8080/path"}. Note + * that if an authority component is defined, then the host component will be + * defined as well (though it may be empty), and conversely. This differs + * from {@link java.net.URI java.net.URI} where an empty authority + * component was considered undefined. Another difference from + * {@code java.net.URI} is that the path component as returned by + * {@link #getPath()} or {@link #getRawPath()} is never {@code null}. + * + * + *

Operations on IRI instances

+ * + * The key operations supported by this class are those of + * normalization, resolution, and relativization. + * These operations are implemented in a manner which is consistent with + * RFC 3986, and therefore their behavior may be different than + * what was implemented in {@link java.net.URI java.net.URI}. + * + *

Normalization is the process of removing unnecessary {@code "."} + * and {@code ".."} segments from the path component of a hierarchical URI. + * Each {@code "."} segment is simply removed. A {@code ".."} segment is + * removed only if it is preceded by a non-{@code ".."} segment, unless the + * path is absolute and the preceding segment is the root of the path + * ({@code "/"}). + * If the path is absolute, it will contain no {@code ".."} segment after + * normalization. If the path is relative, it may contain leading + * {@code ".."} segments.
+ * Normalization has no effect upon opaque URIs. + * + *

Resolution is the process of resolving one URI against another + * base URI. The resulting URI is constructed from components of both + * URIs in the manner specified by RFC 3986, taking components from the + * base URI for those not specified in the original. For hierarchical URIs, + * the path of the original is resolved against the path of the base and then + * normalized. The result, for example, of resolving + * + *

+ * {@code sample/a/index.html#28} + *              + *     (1) + *
+ * + * against the base URI {@code http://example.com/languages/java/} is the result + * URI + * + *
+ * {@code http://example.com/languages/java/sample/a/index.html#28} + *
+ * + * Resolving the relative URI + * + *
+ * {@code ../../demo/b/index.html}    (2) + *
+ * + * against this result yields, in turn, + * + *
+ * {@code http://example.com/languages/java/demo/b/index.html} + *
+ * + * Resolution of both absolute and relative URIs, and of both absolute and + * relative paths in the case of hierarchical URIs, is supported. Resolving + * the URI {@code file:///~calendar} against any other URI simply yields the + * original URI, since it is absolute. Resolving the relative URI (2) above + * against the relative base URI (1) yields the normalized, but still relative, + * URI: + * + *
+ * {@code demo/b/index.html} + *
+ * + *

Relativization, finally, is the inverse of resolution: For any + * two normalized hierarchical URIs u and v, + * + *

+ * u{@code .relativize(}u{@code .resolve(}v{@code )).equals(}v{@code )}  and
+ * u{@code .resolve(}u{@code .relativize(}v{@code )).equals(}v{@code )}  .
+ *
 + * + * Note that the assertions above are only guaranteed to hold + * if v is absolute, or the normalised path of v doesn't + * start with {@code ../}, isn't {@code ..}, and isn't empty or absolute. + * The general intent, corner cases excluded, is that the first assertion + * will hold if v is a well formed relative URI with a non empty path + * referencing some sub-path of u, and the second will hold + * if v is some absolute URI whose scheme and authority match + * those of u and whose path is referencing some sub-path of + * u. + * + *

This operation is often useful when constructing a document containing URIs + * that must be made relative to the base URI of the document wherever + * possible. For example, relativizing the URI + * + *

+ * {@code http://example.com/languages/java/sample/a/index.html#28} + *
+ * + * against the base URI + * + *
+ * {@code http://example.com/languages/java/} + *
+ * + * yields the relative URI {@code sample/a/index.html#28}. + * + * + *

Character categories

+ * + * RFC 3986, combined with RFC 3987, specifies precisely which + * characters are permitted in the various components of a URI reference. + * The following categories, most of which are taken from that specification, + * are used below to describe these constraints: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
Describes categories alpha, digit, alphanum, unreserved, + * sub-delims, gen-delims, reserved, percent-encoded, private and other
CategoryDescription
alphaThe US-ASCII alphabetic characters, + * {@code 'A'} through {@code 'Z'} + * and {@code 'a'} through {@code 'z'}
digitThe US-ASCII decimal digit characters, + * {@code '0'} through {@code '9'}
alphanumAll alpha and digit characters
unreservedAll alphanum characters together with those in the string + * {@code "_-.~"}
sub-delimsThe characters in the string {@code "!$&'()*+,;="}
gen-delimsThe characters in the string {@code ":/?#[]@"}
reservedAll sub-delims characters and all + * gen-delims characters
percent-encodedPercent-encoded octets, that is, triplets consisting of the percent + * character ({@code '%'}) followed by two hexadecimal digits + * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and + * {@code 'a'}-{@code 'f'})
privateUnicode characters in the range + * {@code U+E000-U+F8FF / U+F0000-U+FFFFD / U+100000-U+10FFFD}. + * These characters are defined in the iprivate + * rule of RFC 3987, and correspond to + * + * Private Use Code Points in the Unicode Character Set.
otherThe Unicode characters that are not in the US-ASCII character set, + * are not control characters (according to the {@link + * Character#isISOControl(char) Character.isISOControl} + * method), are not space characters (according to the {@link + * Character#isSpaceChar(char) Character.isSpaceChar} method), are + * not special characters in the range {@code U+FFF0}-{@code U+FFFD}, + * are not + * non-characters, and are not + * + * private use characters. + *
+ * + *

The set of all legal URI characters consists of + * the unreserved, reserved, percent-encoded, private + * and other characters. + * + *

Deviation from RFC 3987: This implementation + * considers non-US-ASCII space characters as defined by + * {@link Character#isSpaceChar(char) Character.isSpaceChar} as illegal + * and will quote them.

+ * + *

Percent-encoded octets, quotation, encoding, and decoding

+ * + * RFC 3986 allows percent-encoded octets to appear in the user-info, host, path, + * query, and fragment components. Percent-encoding serves two purposes in URIs: + * + * + * + * These purposes are served in this class by three related operations: + * + * + * + * These operations are exposed in the methods of this class as follows: + * + * + * + *

Note that these transformations also ensure that + * u{@code .equals(IRI.parseIRI(}u{@code .toIRIString()))} is + * always true.

+ * + *

Identities

+ * + * For any IRI u, it is always the case that + * + *
+ * {@code IRI.parseIRI(}u{@code .toString()).equals(}u{@code )} . + *
+ * and + *
+ *     u.toBuilder().build().equals(u)
+ * + * In addition, + *
+ *     IRI.createOpaque(u.getScheme(),
+ *             u.getRawPath(),
+ *             u.getRawQuery(),
+ *             u.getRawFragment())
+ *     .equals(u)
+ * if u has a scheme, no authority, and its path doesn't start + * with {@code '/'} and is not empty (in other words, if it is + * opaque to hierarchy), + *
+ *     IRI.createHierarchical(u.getScheme(),
+ *             u.getRawAuthority(),
+ *             u.getRawPath(),
+ *             u.getRawQuery(),
+ *             u.getRawFragment())
+ *     .equals(u)
+ * and + *
+ *     IRI.createHierarchical(u.getScheme(),
+ *             u.getRawUserInfo(), u.getRawHostString(), u.getPort(),
+ *             u.getRawPath(), u.getRawQuery(),
+ *             u.getRawFragment())
+ *     .equals(u)
+ * if u is hierarchical. + * + * For any IRI u that does not contain double encoding syntax such + * that one of the decoded components contains a percent-encoded triplet + * sequence corresponding to a legal character for that component (such as e.g. + * {@code "%E2%82%AC"} which corresponds to the UTF-8 encoding of the Euro + * currency symbol {@code "\u20ac"}), the following identities also hold: + *
+ *     IRI.createOpaque(u.getScheme(),
+ *             u.getPath(),
+ *             u.getQuery(),
+ *             u.getFragment())
+ *     .equals(u)
+ * if u has a scheme, no authority, and its path doesn't start + * with {@code '/'} and is not empty (in other words, if it is + * opaque to hierarchy), + *
+ *     IRI.createHierarchical(u.getScheme(),
+ *             u.getAuthority(),
+ *             u.getPath(),
+ *             u.getQuery(),
+ *             u.getFragment())
+ *     .equals(u)
+ * and + *
+ *     IRI.createHierarchical(u.getScheme(),
+ *             u.getUserInfo(), u.getHostString(), u.getPort(),
+ *             u.getPath(), u.getQuery(),
+ *             u.getFragment())
+ *     .equals(u)
+ * if u is hierarchical. + * + * + *

Internationalized Resource Identifiers (IRIs)

+ * + * An instance of this class represents an IRI whenever it contains non-US-ASCII, + * other, or private characters. The following methods perform the + * operations and conversions as described in RFC 3987: + * + * + * + * + *

IRIs, URIs, URLs, and URNs

 + * + * An IRI is an Internationalized Resource Identifier which contains + * characters from the Universal Character Set (Unicode/ISO 10646), + * which encompasses the US-ASCII character set. Hence every URI is an IRI, + * abstractly speaking, and every IRI can be mapped to a URI string + * conforming to RFC 3986. Instances of the + * {@link java.net.URI java.net.URI} class represent a Uniform Resource + * Identifier (URI) reference as defined by the obsolete RFC 2396. + * That specification has been superseded by RFCs 3986 and 3987 which + * define URIs and IRIs, respectively. Both of these entities are now + * represented by {@link IRI java.net.IRI}, which should be preferred + * over {@code java.net.URI}. Because {@link java.net.URI java.net.URI} + * is not fully compliant with RFC 3986 or RFC 3987, but + * is based on RFC 2396, with deviations that bring it close to + * RFC 3986 and RFC 3987, most instances of {@code java.net.IRI} + * can be converted to instances of {@code java.net.URI}, with the exception + * of some corner case URIs such as {@code "about:"}, {@code "http://"}, + * or {@code "//"}, that {@code java.net.URI} is unable to parse. + * + *

In addition, a URI or IRI is a uniform resource identifier + * while a URL is a uniform resource locator. + * Hence every URL is a URI, abstractly speaking, but + * not every URI is a URL. This is because there is another subcategory of + * URIs, uniform resource names (URNs), which name resources but do not + * specify how to locate them. The {@code mailto}, {@code news}, and + * {@code isbn} URIs shown above are examples of URNs. + * + *

The conceptual distinction between URIs and URLs is reflected in the + * differences between this class and the {@link URL} class. + * + *

An instance of this class represents a URI reference in the syntactic + * sense defined by RFC 3986 and RFC 3987. + * A URI may be either absolute or relative. + * A URI string is parsed according to the generic syntax without regard to the + * scheme, if any, that it specifies. No lookup of the host, if any, is + * performed, and no scheme-dependent stream handler is constructed. Equality, + * hashing, and comparison are defined strictly in terms of the character + * content of the instance. In other words, a URI instance is little more than + * a structured string that supports the syntactic, scheme-independent + * operations of comparison, normalization, resolution, and relativization. + * + *

An instance of the {@link URL} class, by contrast, represents the + * syntactic components of a URL together with some of the information required + * to access the resource that it describes. A URL must be absolute, that is, + * it must always specify a scheme. A URL string is parsed according to its + * scheme. A stream handler is always established for a URL, and in fact it is + * impossible to create a URL instance for a scheme for which no handler is + * available. Equality and hashing depend upon both the scheme and the + * Internet address of the host, if any; comparison is not defined. In other + * words, a URL is a structured string that supports the syntactic operation of + * resolution as well as the network I/O operations of looking up the host and + * opening a connection to the specified resource. Applications should + * refrain from creating instances of {@code java.net.URL} directly, and should + * preferably use {@link IRI#toURL() IRI.toURL()} to do so. + * + *

RFC 3986 and RFC 2396: compatibility with java.net.URI

+ * + *

In RFC 2396, + * i.e. the previous specification of URI syntax, such as supported by + * {@link java.net.URI java.net.URI}, a URI had the syntax + * + *

+ * [scheme{@code :}]scheme-specific-part[{@code #}fragment] + *
+ * + * at the highest level. URIs were also classified according to whether they + * were opaque or hierarchical. + * An opaque URI was defined as an absolute URI whose scheme-specific + * part does not begin with a slash character ({@code '/'}). Opaque URIs were + * not subject to further parsing. + * + *

As explained earlier in this document, in + * RFC 3986 this + * terminology has been replaced by allowing a URI to have a path component + * which may be opaque to hierarchy. + * + *

These differences may seem innocuous but they do have an impact + * on behavioral differences between {@link java.net.URI java.net.URI} + * and this class. For instance, an opaque URI as parsed by {@code + * java.net.URI} will have a {@code null} path component, whereas as parsed + * by this class, it will simply have an opaque path component. + * Similarly, an opaque URI as modelled with {@code java.net.URI} has + * no query component: the query component is included in the opaque + * scheme-specific-part of the URI. With {@code java.net.IRI}, an + * opaque URI may now have a query component. + * + *

The raw scheme-specific-part of a {@code java.net.URI} + * may be reconstructed by concatenating the raw authority, path, and query + * components of a corresponding IRI instance using the following rule: + * {@code ["//" authority] path ["?" query]} + * where parts enclosed in brackets are omitted when their corresponding + * raw component is null. + * + *

Another difference brought by + * RFC 3986 + * is in the parsing of the authority component. RFC 2396 used to make + * the distinction between a host name and reg_name. + * RFC 3986 does not. + * The consequence is that with {@code java.net.URI}, a host name that + * did not strictly abide with the syntax for DNS names was parsed as + * a reg_name, and {@link URI#getHost() java.net.URI::getHost} + * would return {@code null}. Not so for {@code java.net.IRI} where + * the host is the reg-name. For instance, in + * {@code "http://x_y.z.com:80/example"}, {@code java.net.URI} would + * report a {@code null} host, while {@code java.net.IRI} will parse + * the host string as {@code "x_y.z.com"}. + * As a consequence the {@linkplain #getRawHostString() + * raw host} component, being a potential reg-name, is no longer + * guaranteed to be either a + * syntactically valid IP-literal address or to conform to the DNS syntax. + * To prevent misinterpretation when the base {@link ResourceIdentifier} + * class is used, this class provides a {@link #getHost()} method that + * behaves as {@link URI#getHost()}. That is, {@link #getHost()} + * will return null if the decoded host component does not syntactically + * correspond to an IPv4 literal address, and IPv6 literal address, or a + * reg-name that conforms to the DNS syntax. + * APIs that need to deal with internationalized host names are encouraged + * to use the uninterpreted {@linkplain #getRawHostString() raw host string} + * or {@linkplain #getHostString() decoded host string} as they see fit, and can + * use the {@link #getHostType(String)} method to make informed decisions + * on how to handle that host part of the authority component. + * + *

Finally, there are also other differences in behaviors for the + * {@code normalize} and {@code resolve} methods, as well as in + * the corresponding {@code relativize} method.

+ * + *

Note that above definitions, e.g. scheme-specific-part component and + * opaque vs. hierarchical, are all subject to RFC 2396. They're deprecated + * in RFC 3986. + * + * @apiNote + * + * Applications working with file paths and file URIs should take great + * care to use the appropriate methods to convert between the two. + * The {@link Path#of(URI)} factory method and the {@link File#File(URI)} + * constructor can be used to create {@link Path} or {@link File} + * objects from a file URI. {@link Path#toUri()} and {@link File#toURI()} + * can be used to create a {@link URI} from a file path, which can + * be converted to an IRI for further manipulation using {@link + * IRI#of(ResourceIdentifier)}. + * Applications should never try to {@linkplain + * #createHierarchical(String, String, String, String) + * construct}, {@linkplain #parseIRI(String) parse}, or + * {@linkplain #resolve(String) resolve} an {@code IRI} + * from the direct string representation of a {@code File} or {@code Path} + * instance. + *

+ * Some components of a URI or IRI, such as userinfo, may + * be abused to construct misleading URLs or URIs. Applications + * that deal with URIs or IRIs should take into account + * the recommendations advised in RFC3986, + * Section 7, Security Considerations. + * + * @see RFC 3986: Uniform + * Resource Identifier (URI): Generic Syntax,
RFC 3987: Internationalized + * Resource Identifiers (IRIs),
RFC 3513: Internet + * Protocol Version 6 (IPv6) Addressing Architecture,
URISyntaxException + */ + +public final class IRI extends ResourceIdentifier + implements Comparable, Serializable +{ + + // Note: Comments containing the word "ASSERT" indicate places where a + // throw of an InternalError should be replaced by an appropriate assertion + // statement once asserts are enabled in the build. + + static final long serialVersionUID = -6052424284110960213L; + + + // -- Properties and components of this instance -- + + // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>] + private final transient String scheme; // null ==> relative URI + private final transient String fragment; + + // Hierarchical URI components: [//<authority>]<path>[?<query>] + private final transient String authority; // opaque => null authority + + // Authority: [<userInfo>@][<host>][:<port>] + private final transient String userInfo; // null if absent + private final transient String host; // null if authority is null + private final transient int port; // -1 ==> undefined + + // Remaining components of hierarchical URIs + private final transient String path; + private final transient String query; + + // The remaining fields may be computed on demand, which is safe even in + // the face of multiple threads racing to initialize them + private transient int hash; // Zero ==> undefined + + private transient String decodedUserInfo; + private transient String decodedAuthority; + private transient String decodedHost; + private transient String decodedPath; + private transient String decodedQuery; + private transient String decodedFragment; + + /** + * The string form of this URI. + * + * @serial + */ + private volatile String string; // The only serializable field + + // The IRI string form of this URI, i.e.
there are no + // unnecessarily percent-encoded characters in it according to RFC 3987 + private volatile transient String iriString; + + + // -- Constructors and factories -- + + private IRI(String input, + String scheme, + String authority, + String userInfo, + String host, + int port, + String path, + String query, + String fragment) { + assert host != null || authority == null; /* an authority is never present without a host */ + assert path != null; /* the path is never null; an absent path is the empty string */ + this.string = input; + this.scheme = scheme; + this.authority = authority; + this.userInfo = userInfo; + this.host = host; + this.port = port; + this.path = path; + this.query = query; + this.fragment = fragment; + } + + + /** + * Creates an IRI by parsing the given string. + * + *

This method parses the given string exactly as specified by the + * grammar in RFC 3987, + * Section 2.2. ABNF for IRI References and IRIs

+ * + * @param encodedString The string to be parsed into an IRI + * @return the IRI + * + * @throws NullPointerException + * If {@code encodedString} is {@code null} + * + * @throws URISyntaxException + * If the given string violates RFC 3986 or RFC 3987 + */ + public static IRI parseIRI(String encodedString) + throws URISyntaxException + { + return new Parser(encodedString).parse(true); + } + + /** + * Creates an IRI by leniently parsing the given string. + * + *

This method implements a lenient parsing of the + * given IRI string, which will first percent-encode the printable + * characters in US-ASCII that are not allowed in URIs, but which + * are sometimes used in queries (such as the pipe ({@code '|'}) + * character), as specified by the {@linkplain #quoteLenient(String)} + * method. The resulting string is then parsed as if by calling + * {@link IRI#parseIRI(String)}.

+ * + * @implSpec Calling this method is equivalent to calling {@code + * IRI.parseIRI(IRI.quoteLenient(iriString))}. + * + * @param iriString The string to be parsed into an IRI + * @return the IRI + * + * @throws NullPointerException + * If {@code iriString} is {@code null} + * + * @throws URISyntaxException + * If the given string, after percent-encoding the + * printable characters in US-ASCII that are not + * allowed in URIs, still violates RFC 3986 or + * RFC 3987 + * + * @see #quoteLenient(String) + * @see #unquoteLenient(String) + * @see #toLenientString() + * + */ + public static IRI parseLenient(String iriString) + throws URISyntaxException + { /* NOTE(review): the quoting step is inlined here with the L_LENIENT/H_LENIENT masks; confirm quoteLenient(String) uses the same arguments so the @implSpec stays true */ + return new Parser(quote(iriString, L_LENIENT, H_LENIENT, NonASCII.NOQUOTES)) + .parse(true); + } + + /** + * Creates a hierarchical IRI from the given components. + * + *

If a scheme is given then the path, if also given, must either be + * empty or begin with a slash character ({@code '/'}). Otherwise a + * component of the new URI may be left undefined by passing {@code null} + * for the corresponding parameter or, in the case of the {@code port} + * parameter, by passing {@code -1}. + * + *

This method first builds a URI string from the given components + * according to the rules specified in RFC 3986, + * section 5.3:

+ * + *
    + * + *
  1. Initially, the result string is empty.

  2. + * + *
  3. If a scheme is given then it is appended to the result, + * followed by a colon character ({@code ':'}).

  4. + * + *
  5. If user information, a host, or a port are given then the + * string {@code "//"} is appended.

  6. + * + *
  7. If user information is given then it is appended, followed by + * a commercial-at character ({@code '@'}). Any character not in the + * unreserved, sub-delims, percent-encoded, + * or other categories, and not equal to the colon character + * ({@code ':'}), is quoted.

  8. + * + *
  9. If a host is given then it is appended. If the host is a + * literal IPv6 address but is not enclosed in square brackets + * ({@code '['} and {@code ']'}) then the square brackets are added. + * Otherwise if the host is a reg-name, then any character not in the + * unreserved, sub-delims or percent-encoded + * or other categories is quoted. + *

  10. + * + *
  11. If a port number is given then a colon character + * ({@code ':'}) is appended, followed by the port number in decimal. + *

  12. + * + *
  13. If a path is given then it is appended. Any character not in + * the unreserved, sub-delims, percent-encoded, + * or other categories, and not equal to the slash character + * ({@code '/'}), the colon character ({@code ':'}), or the + * commercial-at character ({@code '@'}), is quoted.

  14. + * + *
  15. If a query is given then a question-mark character + * ({@code '?'}) is appended, followed by the query. Any character not in + * the unreserved, sub-delims, percent-encoded, + * other or private categories, and not equal to the slash + * character ({@code '/'}), the colon character ({@code ':'}), the + * commercial-at character ({@code '@'}) or the question-mark + * character ({@code '?'}), is quoted. + *

  16. + * + *
  17. Finally, if a fragment is given then a hash character + * ({@code '#'}) is appended, followed by the fragment. Any character not in + * the unreserved, sub-delims, percent-encoded, + * or other categories, and not equal to the slash + * character ({@code '/'}), the colon character ({@code ':'}), the + * commercial-at character ({@code '@'}) or the question-mark + * character ({@code '?'}), is quoted.

  18. + * + *
+ * + * @param scheme Scheme name + * @param userInfo User name and authorization information + * @param host Host name + * @param port Port number + * @param path Path + * @param query Query + * @param fragment Fragment + * @return The new IRI + * + * @throws URISyntaxException + * If the URI string constructed from the given components violates + * RFC 3986 or RFC 3987 + * @throws IllegalArgumentException If the given arguments are not + * consistent. For instance, + * if both a scheme and a path are given but the path is relative; + * or if {@code host} is {@code null} but {@code userInfo} is not + * {@code null} or {@code port} is not {@code -1}; or if {@code host} is + * {@code null} and the {@code path} component starts with + * {@code "//"} (see RFC 3986 section 3), or contains + * {@code ':'} before the first {@code '/'}; or if {@code host} + * is not {@code null}, {@code scheme} is {@code null}, {@code path} + * is not empty and does not start with {@code '/'}. + * + */ + public static IRI createHierarchical(String scheme, + String userInfo, + String host, + int port, + String path, + String query, + String fragment) + throws URISyntaxException + { + if (host == null && (userInfo != null || port != -1)) { /* userinfo and port only exist inside an authority, which needs a host */ + throw new IllegalArgumentException( + "host can not be null when authority is present (userinfo or port are defined)"); + } + path = checkHierarchicalPath(scheme, null, userInfo, host, port, path, true); /* the returned path is the one serialized below */ + String s = toString(scheme, + null, userInfo, host, port, + path, query, fragment); + return new Parser(s).parse(true); + } + + /** + * Creates a hierarchical IRI from the given components. + * + *

If a scheme is given then the path, if also given, must either be + * empty or begin with a slash character ({@code '/'}). Otherwise a + * component of the new IRI may be left undefined by passing {@code null} + * for the corresponding parameter. + * + *

This method first builds an IRI string from the given components + * according to the rules specified in RFC 3986, + * section 5.3:

+ * + *
    + * + *
  1. Initially, the result string is empty.

  2. + * + *
  3. If a scheme is given then it is appended to the result, + * followed by a colon character ({@code ':'}).

  4. + * + *
  5. If an authority is given then the string {@code "//"} is + * appended, followed by the authority. If the authority contains a + * literal IPv6 address then the address must be enclosed in square + * brackets ({@code '['} and {@code ']'}). Any character not in the + * unreserved, sub-delims, percent-encoded, + * or other categories, and not equal to the commercial-at character + * ({@code '@'}) or the colon character ({@code ':'}), + * is quoted.

  6. + * + *
  7. If a path is given then it is appended. Any character not in + * the unreserved, sub-delims, percent-encoded, + * or other categories, and not equal to the slash character + * ({@code '/'}), the colon character ({@code ':'}), or the + * commercial-at character ({@code '@'}), is quoted.

  8. + * + *
  9. If a query is given then a question-mark character + * ({@code '?'}) is appended, followed by the query. Any character not in + * the unreserved, sub-delims, percent-encoded, + * other or private categories, and not equal to the slash + * character ({@code '/'}), the colon character ({@code ':'}), the + * commercial-at character ({@code '@'}) or the question-mark + * character ({@code '?'}), is quoted. + *

  10. + * + *
  11. Finally, if a fragment is given then a hash character + * ({@code '#'}) is appended, followed by the fragment. Any character not in + * the unreserved, sub-delims, percent-encoded, + * or other categories, and not equal to the slash + * character ({@code '/'}), the colon character ({@code ':'}), the + * commercial-at character ({@code '@'}) or the question-mark + * character ({@code '?'}), is quoted.

  12. + * + *
+ * + * @param scheme Scheme name + * @param authority Authority + * @param path Path + * @param query Query + * @param fragment Fragment + * @return The new IRI + * + * @throws URISyntaxException + * If the URI string constructed from the given components violates + * RFC 3986 or RFC 3987 + * @throws IllegalArgumentException if the given arguments are not + * consistent. For instance, + * if both a scheme and a path are given but the path is relative; + * or if no scheme and authority are provided and {@code path} starts with + * {@code "//"} or contains {@code ':'} before the first {@code '/'}; + * or if {@code authority} is not {@code null}, {@code scheme} is {@code null}, + * {@code path} is not empty and does not start with {@code '/'}. + * + */ + public static IRI createHierarchical(String scheme, + String authority, + String path, + String query, + String fragment) + throws URISyntaxException + { + path = checkHierarchicalPath(scheme, authority, null, null, -1, path, true); /* the returned path is the one serialized below */ + String s = toString(scheme, + authority, null, null, -1, + path, query, fragment); + return new Parser(s).parse(true); + } + + /** + * Creates a hierarchical IRI from the given components. + * + *

A component may be left undefined by passing {@code null}. + * + *

This convenience factory method works as if by invoking the + * seven-argument factory method as follows: + * + *

+ * {@link IRI#createHierarchical(String,String,String,int,String,String,String) + * IRI.createHierarchical}{@code (scheme, null, host, port, path, null, fragment);} + *
+ * + * @param scheme Scheme name + * @param host Host name + * @param port Port number + * @param path Path + * @param fragment Fragment + * @return The new IRI + * + * @throws URISyntaxException + * If the URI string constructed from the given components + * violates RFC 3986 or RFC 3987. + * @throws IllegalArgumentException + * If the given arguments are not consistent, as specified for + * the seven-argument factory method (for instance a {@code null} + * host with a port other than {@code -1}) + */ + public static IRI createHierarchical(String scheme, + String host, + int port, + String path, + String fragment) + throws URISyntaxException + { + return createHierarchical(scheme, null, host, port, path, null, fragment); + } + + /** + * Creates a hierarchical IRI from the given components. + * + *

A component may be left undefined by passing {@code null}. + * + *

This convenience factory method works as if by invoking the + * seven-argument factory method as follows: + * + *

+ * {@link IRI#createHierarchical(String,String,String,int,String,String,String) + * IRI.createHierarchical}{@code (scheme, null, host, -1, path, null, fragment);} + *
+ * + * @param scheme Scheme name + * @param host Host name + * @param path Path + * @param fragment Fragment + * @return The new IRI + * + * @throws URISyntaxException + * If the URI string constructed from the given components + * violates RFC 3986 or RFC 3987 + * @throws IllegalArgumentException + * If the given arguments are not consistent, as documented for + * the seven-argument factory method + */ + public static IRI createHierarchical(String scheme, + String host, + String path, + String fragment) + throws URISyntaxException + { + return createHierarchical(scheme, null, host, -1, path, null, fragment); + } + + /** + * Creates an opaque IRI from the given components. + * + *

A component may be left undefined by passing {@code null}. + * + *

This method first builds a URI in string form using the given + * components as follows:

+ * + *
    + * + *
  1. Initially, the result string is empty.

  2. + * + *
  3. If a scheme is given then it is appended to the result, + * followed by a colon character ({@code ':'}).

  4. + * + *
  5. If an opaque path is given then it is appended. Any character + * not in the unreserved, sub-delims, percent-encoded, + * or other categories, and not equal to the slash character + * ({@code '/'}), the colon character ({@code ':'}), or the + * commercial-at character ({@code '@'}), is quoted.

  6. + * + *
  7. If a query is given then a question-mark character + * ({@code '?'}) is appended, followed by the query. Any character not in + * the unreserved, sub-delims, percent-encoded, + * other or private categories, and not equal to the slash + * character ({@code '/'}), the colon character ({@code ':'}), the + * commercial-at character ({@code '@'}) or the question-mark + * character ({@code '?'}), is quoted. + *

  8. + * + *
  9. Finally, if a fragment is given then a hash character + * ({@code '#'}) is appended, followed by the fragment. Any character not in + * the unreserved, sub-delims, percent-encoded, + * or other categories, and not equal to the slash + * character ({@code '/'}), the colon character ({@code ':'}), the + * commercial-at character ({@code '@'}) or the question-mark + * character ({@code '?'}), is quoted.

  10. + * + *
+ * + *

The resulting URI string is then parsed in order to create the new + * IRI instance as if by invoking the {@link IRI#parseIRI(String)} + * factory method; this may cause a {@link URISyntaxException} to be thrown. + * + * @param scheme Scheme name + * @param opaque Opaque path + * @param query Query + * @param fragment Fragment + * @return The new IRI + * + * @throws IllegalArgumentException if {@code scheme} is {@code null}. + * @throws URISyntaxException + * If the URI string constructed from the given components + * violates RFC 3986 or RFC 3987, or if its path + * is not opaque to hierarchy. + */ + public static IRI createOpaque(String scheme, String opaque, String query, String fragment) + throws URISyntaxException + { + if (scheme == null) { + throw new IllegalArgumentException("A scheme is required to build an opaque URI"); + } + IRI iri = new Parser(toString(scheme, null, null, null, + -1, opaque, query, fragment)) /* the opaque part travels in the path slot; no authority is supplied */ + .parse(true); + if (!iri.isOpaque()) { /* isOpaque() requires a non-empty path that does not start with '/' */ + throw new URISyntaxException(iri.toString(), "URI is not opaque"); + } + return iri; + } + + /** + * Converts the given string into an {@code IRI}. + * + *

This convenience factory method works as if by invoking the + * {@linkplain IRI#parseIRI(String) parseIRI factory}; + * any {@link URISyntaxException} thrown by {@code parseIRI} + * is caught and wrapped in a new {@link IllegalArgumentException} object, + * which is then thrown. + * + *

This method is provided for use in situations where it is known that + * the given string is a legal URI, for example with constant IRI literals + * declared within a program, and so it would be considered a programming + * error for the string not to parse as such. The factory methods, which + * throw {@link URISyntaxException} directly, should be used in situations + * where a URI is being created from user input or from some other source + * that may be prone to errors. + * + * @param str The string to be parsed into an {@code IRI} + * @return The new {@code IRI} + * + * @throws NullPointerException + * If {@code str} is {@code null} + * + * @throws IllegalArgumentException + * If the given string violates RFC 3986 or RFC 3987 + */ + public static IRI of(String str) { + try { + return parseIRI(str); + } catch (URISyntaxException x) { /* convert the checked exception to an unchecked one, keeping it as the cause */ + throw new IllegalArgumentException(x.getMessage(), x); + } + } + + + /** + * Converts the given {@code ResourceIdentifier} into an {@code IRI}. + * + *

If the given {@link ResourceIdentifier} {@code ri} is an + * {@code IRI}, returns {@code ri}. Otherwise, returns a new {@code IRI} + * constructed as if by calling {@link #of(String) + * IRI.of(ri.toString())}.

+ * + * @apiNote + *

This method is provided for use in situations where an API + * accepts an abstract {@link ResourceIdentifier} as input, on + * the condition that it must be convertible to a concrete instance + * of {@code IRI} for further processing.

+ * + * @param ri The {@code ResourceIdentifier} to be converted into + * an {@code IRI} + * @return An {@code IRI} converted from the given {@code ResourceIdentifier} + * + * @throws NullPointerException + * If {@code ri} is {@code null} + * + * @throws IllegalArgumentException + * If the given {@code ResourceIdentifier} violates RFC 3986 + * or RFC 3987 and cannot be converted to an {@code IRI} + */ + public static IRI of(ResourceIdentifier ri) { /* a null ri fails the instanceof test and then throws NullPointerException at ri.toString() */ + return (ri instanceof IRI) ? (IRI)ri : IRI.of(ri.toString()); + } + + + + // -- Operations -- + + /** + * Normalizes this URI's path. + * + *

If this URI is opaque, or if its path is already in normal form, + * and its authority is either {@code null} or doesn't end with {@code ':'}, + * then this URI is returned. Otherwise a new URI is constructed that is + * identical to this URI except that its authority is normalized by removing + * the superfluous colon, if any, and its path is computed by normalizing + * this URI's path in a manner consistent with RFC 3986, + * section 5.2.4; that is: + *

+ * + *
    + * + *
  1. All {@code "."} segments are removed.

  2. + * + *
  3. If a {@code ".."} segment is preceded by a non-{@code ".."} + * segment then both of these segments are removed. This step is + * repeated until it is no longer applicable.

  4. + * + *
  5. If the path is relative, and if its first segment contains a + * colon character ({@code ':'}), then a {@code "."} segment is + * prepended. This prevents a relative URI with a path such as + * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a + * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. + * (Deviation from RFC 3986)

  6. + * + *
+ * + *

A normalized path will begin with one or more {@code ".."} segments + * if the URI is relative and there were insufficient non-{@code ".."} + * segments preceding them to allow their removal. + * A normalized path will begin with a {@code "."} + * segment if one was inserted by step 3 above. Otherwise, a normalized + * path will not contain any {@code "."} or {@code ".."} segments.

+ * + * @return An IRI equivalent to this IRI, + * but whose path and authority are in normal form + */ + public IRI normalize() { /* delegates to the one-argument normalize(IRI) overload */ + return normalize(this); + } + + /** + * Resolves the given IRI against this IRI. + * + *

If this IRI or the given IRI are not + * hierarchical, then the given IRI is returned.

+ * + *

Otherwise this method constructs a new hierarchical URI in a manner + * consistent with RFC 3986, + * section 5.2; that is:

+ * + *

If the given URI's scheme is defined, then a URI with + * the normalized given path and with all other components equal to + * those of the given URI is returned. The returned IRI path is + * normalized as if by invoking the {@link #normalize() normalize} + * method.

+ * + *

Otherwise, a new IRI is constructed with this URI's scheme and the given + * URI's fragment component:

+ * + *
    + * + *
  1. If the given URI has an authority component then the new URI's + * authority, path and query are taken from the given URI. The returned + * IRI path is normalized as if by invoking the {@link + * #normalize() normalize} method.

  2. + * + *
  3. Otherwise the new URI's authority component is copied from + * this URI, then its path is computed as follows:

    + * + *
      + * + *
    1. If the given URI's path is absolute then the new URI's path + * is taken from the given URI and is normalized as if by invoking the + * {@link #normalize() normalize} method.

    2. + * + *
    3. Otherwise the given URI's path is relative, and so the new + * URI's path is computed by resolving the path of the given URI + * against the base path of this URI. If this URI has an authority and + * its path is empty then {@code "/"} is taken for the base path. + * If this URI path is {@code ".."} or ends with {@code "/.."} + * then this URI path appended with {@code "/"} is taken for + * the base path. Otherwise the base path is this URI path. + * All but the last segment of the base path are then concatenated, + * if any, with the given URI's path and the result is then normalized + * as if by invoking the {@link #normalize() normalize} method.

    4. + * + *
    + * + *

    and then its query is computed as:

    + * + *
      + * + *
    1. If the given URI's path is not empty then the new URI's + * query component is always taken from the given URI's query.

    2. + * + *
    3. If the given URI's path is empty and query is defined, + * the new URI's query component is taken from the given URI's query. + * Otherwise, it is taken from the base URI's query.

    4. + * + *
    + * + *
  4. + * + *
+ * + *

The result of this method is absolute if, and only if, either this + * IRI is absolute or the given IRI is absolute.

+ * + * @param uri The IRI to be resolved against this IRI + * @return The resulting IRI + * + * @throws NullPointerException + * If {@code uri} is {@code null} + */ + public IRI resolve(IRI uri) { /* this IRI is passed as the base, uri as the reference to resolve */ + return resolve(this, uri); + } + + /** + * Constructs a new IRI by parsing the given string and then resolving it + * against this IRI. + * + *

This convenience method works as if invoking it were equivalent to + * evaluating the expression {@link #resolve(IRI) + * resolve}{@code (IRI.}{@link #of(String) of}{@code (str))}.

+ * + * @param str The string to be parsed into an IRI + * @return The resulting IRI + * + * @throws NullPointerException + * If {@code str} is {@code null} + * + * @throws IllegalArgumentException + * If the given string violates RFC 3986 or RFC 3987 + */ + public IRI resolve(String str) { /* IRI.of reports a syntax error as IllegalArgumentException, hence no checked exception here */ + return resolve(IRI.of(str)); + } + + /** + * Relativizes the given IRI against this IRI. + * + *

The relativization of the given IRI against this IRI is computed as + * follows:

+ * + *
    + * + *
  1. If either this IRI or the given IRI are non hierarchical, + * or if the scheme and authority components of the two IRIs are not + * identical, or if the path of this IRI is not a prefix of the path + * of the given IRI, then the given IRI is returned.

  2. + * + *
  3. Otherwise a new relative hierarchical IRI is constructed with + * query and fragment components taken from the given IRI and with a path + * component computed by removing this URI's path from the beginning of + * the given URI's path.

  4. + * + *
  5. If the resulting URI is relative, has no authority component, + * and if the first segment of its path contains a + * colon character ({@code ':'}), then a {@code "."} segment is + * prepended. This prevents a relative URI with a path such as + * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a + * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. + *

  6. + * + *
+ * + * @apiNote + * + * In more general terms, if both URIs are hierarchical, their scheme and + * authority components are identical, and the normalized path of this URI + * (the base URI) is a prefix of the normalized path of the given URI, + * then this method returns a relative-URI that, when resolved against + * the base URI, yields the normalized form of the given URI. Otherwise, + * it returns the given URI. + * + * @param uri The IRI to be relativized against this IRI + * @return The resulting IRI + * + * @throws NullPointerException + * If {@code uri} is {@code null} + */ + public IRI relativize(IRI uri) { + return relativize(this, uri); + } + + /** + * Constructs a URL from this IRI. + * + *

This convenience method works as if invoking it were equivalent to + * evaluating the expression {@code new URL(this.toString())} after + * first checking that this IRI is absolute.

+ * + * @return A URL constructed from this IRI + * + * @throws IllegalArgumentException + * If this IRI is not absolute + * + * @throws MalformedURLException + * If a protocol handler for the URL could not be found, + * or if some other error occurred while constructing the URL + */ + public URL toURL() throws MalformedURLException { + return fromURI(this); + } + + /** + * Creates a URL from an IRI, as if by invoking {@code uri.toURL()}. + * + */ + static URL fromURI(IRI uri) throws MalformedURLException { + if (!uri.isAbsolute()) { + throw new IllegalArgumentException("URI is not absolute"); + } + String protocol = uri.getScheme(); + + // In general we need to go via Handler.parseURL, but for the jrt + // protocol we enforce that the Handler is not overrideable and can + // optimize URI to URL conversion. + // + // Case-sensitive comparison for performance; malformed protocols will + // be handled correctly by the slow path. + if (protocol.equals("jrt") && !uri.isOpaque() + && uri.getRawFragment() == null) { + + String query = uri.getRawQuery(); + String path = uri.getRawPath(); + String file = (query == null) ? path : path + "?" + query; + + // URL represents an undefined host as the empty string while URI uses null + String host = uri.getHost(); + if (host == null) { + host = ""; + } + + int port = uri.getPort(); + + return new URL("jrt", host, port, file, null); + } else { + return new URL((URL)null, uri.toString(), null); + } + } + + // -- Component access methods -- + + /** + * Returns the scheme component of this IRI. + * + *

The scheme component of a URI, if defined, only contains characters + * in the alphanum category and in the string {@code "-.+"}. A + * scheme always starts with an alpha character.

+ * + * The scheme component of a URI cannot contain percent-encoded octets, + * hence this method does not perform any decoding. + * + * @return The scheme component of this IRI, + * or {@code null} if the scheme is undefined + */ + @Override + public String getScheme() { /* a null scheme marks a relative reference */ + return scheme; + } + + /** + * Tells whether or not this IRI is absolute. + * + *

A URI is absolute if, and only if, it has a scheme component.

+ * + * @return {@code true} if, and only if, this IRI is absolute + */ + @Override + public boolean isAbsolute() { + return scheme != null; + } + + /** + * Tells whether or not this IRI has a path which is opaque + * to hierarchy. + * + * @implSpec + *

The concept of opaque URI is defined in RFC 2396. + * In RFC 3986, the definition of such URI has been replaced with + * a better description of how the path component may be opaque to + * hierarchy, i.e. path-rootless rule of RFC 3986. + * This implementation follows the rules of RFC 3986 and + * this method returns {@code true} if this IRI has a scheme, but + * has no authority and its path isn't empty and doesn't + * start with {@code '/'}.

+ * + * @return {@code true} if, and only if, this IRI has a path + * which is opaque to hierarchy. + */ + @Override + public boolean isOpaque() { + // TODO can path be null now? I don't think so... + return (isAbsolute() && authority == null && + !path.equals("") && !path.startsWith("/")); + } + + /** + * Returns the raw authority component of this IRI. + * + *

The authority component of a URI, if defined, only contains the + * commercial-at character ({@code '@'}) or the colon character + * ({@code ':'}) and characters in the unreserved, sub-delims, + * percent-encoded, and other categories. The authority + * is further constrained to have valid syntax for its + * user-information, host, and port components, if present.

+ * + * @return The raw authority component of this IRI, + * or {@code null} if the authority is undefined + */ + @Override + public String getRawAuthority() { /* returns the stored field as is; getAuthority() is the decoding variant */ + return authority; + } + + /** + * Returns the decoded authority component of this IRI. + * + *

The string returned by this method is equal to that returned by the + * {@link #getRawAuthority() getRawAuthority} method except that all + * sequences of percent-encoded octets are decoded + * as specified earlier + * in this document. + *

+ * + * @return The decoded authority component of this IRI, + * or {@code null} if the authority is undefined + */ + @Override + public String getAuthority() { /* decoded on first use and cached in decodedAuthority; the field comments state that racing initializations are safe */ + String auth = decodedAuthority; + if ((auth == null) && (authority != null)) { + decodedAuthority = auth = decode(authority, DecodeInfo.AUTH); + } + return auth; + } + + /** + * Returns the raw user-information component of this IRI. + * + *

The user-information component of a URI, if defined, only contains + * characters in the unreserved, sub-delims, percent-encoded, + * and other categories, or the colon character ({@code ':'}).

*
 * @return The raw user-information component of this IRI,
 *         or {@code null} if the user information is undefined
 */
@Override
public String getRawUserInfo() {
    // Returned exactly as stored; no decoding is applied here.
    return userInfo;
}

/**
 * Returns the decoded user-information component of this IRI.
 *
 *

The string returned by this method is equal to that returned by the + * {@link #getRawUserInfo() getRawUserInfo} method except that all + * sequences of percent-encoded octets are decoded + * as specified earlier + * in this document. + *

+ * + * @return The decoded user-information component of this IRI, + * or {@code null} if the user information is undefined + */ + @Override + public String getUserInfo() { + String user = decodedUserInfo; + if ((user == null) && (userInfo != null)) { + decodedUserInfo = user = decode(userInfo, DecodeInfo.USER); + } + return user; + } + + /** + * Returns the raw host component of this IRI. + * + *

The host component of an IRI, if defined, will have one of the + * following forms:

*
 *
 *
 * Further information on the exact form of the raw host component of an
 * IRI {@code u} can be obtained by calling {@link #getHostType(String)
 * IRI.getHostType(u.getRawHostString())}.
 *
 * @return The raw host component of this IRI,
 *         or {@code null} if the host component is undefined
 */
public String getRawHostString() {
    // Returned exactly as stored; no decoding is applied here.
    return host;
}

/**
 * Returns the decoded host component of this IRI.
 *
 *

The string returned by this method is equal to that returned by the + * {@link #getRawHostString() getRawHostString} method except that all + * sequences of percent-encoded octets are decoded + * as specified earlier + * in this document.

+ * + * Further information on the exact form of the decoded host component of + * an IRI u can be obtained by calling {@link #getHostType(String) + * IRI.getHostType(u.getHostString())}. Note that the raw host component + * and decoded host component might parse as different types if the raw + * host component contains superfluous percent-encoding of US-ASCCI + * characters. + * + * @apiNote + * + * Because RFC 3986 allows the presence of percent-encoded + * characters in the raw host component, the string returned by this method + * can contain any characters, including characters that are not legal + * in a URI, or in a DNS name. Applications are thus encouraged to + * further validate the content of the host string before using it. + * See also the {@link #getHost() IRI.getHost} method. + * + * @return The decoded host component of this IRI, + * or {@code null} if the host component is undefined + */ + public String getHostString() { + String decoded = decodedHost; + if ((decoded == null) && (host != null)) { + decoded = decodedHost = decode(host, DecodeInfo.HOST); + } + return decoded; + } + + /** + * Returns the decoded host component of this IRI, if it + * can be parsed as a syntactically valid IPv4 literal address, + * IPv6 literal address, or a reg-name conforming to the DNS syntax, + * {@code null} otherwise. + * + * @apiNote + * + *

The string returned by this method is either equal to + * that returned by the {@link #getHostString() getHostString} + * method, if {@code IRI.getHostType(getHostString()).isInternetName()} + * yields true, or {@code null}.

+ * + *

This method is provided to avoid misinterpretation of the + * host component when using the base {@linkplain ResourceIdentifier} + * abstraction. Because RFC 3986 allows for the presence of + * percent-encoded triplets in the host component, using the + * {@linkplain #getHostString() decoded host} string directly without + * further validation could be dangerous, as a reg-name, + * once decoded, could contain just any character. + * On the other hand the string returned by the {@code getHost} + * method, if not {@code null}, is guaranteed to be a syntactically + * valid IPv4 literal, bracket-enclosed IPv6 literal, or to be + * a name conforming to the DNS syntax. + * APIs that need to deal with internationalized host + * names are encouraged to make use of the {@link IDN} class + * to encode the host component prior to creating an IRI, or to + * make use of the {@link #getHostString()}, {@link #getRawHostString()} + * and {@link #getHostType(String)} methods in order to figure out + * whether a host component needs to be {@linkplain IDN IDN} encoded + * before being resolved into an internet address.

+ * + * @return The decoded host component of this URI, + * or {@code null} if the decoded host component does not + * parse as an {@linkplain HostType#isInternetName() + * internet name}. + */ + public String getHost() { + String decoded = getHostString(); + return getHostType(decoded).isInternetName() + ? decoded : null; + } + + /** + * Returns the port number of this URI. + * + *

The port component of a URI, if defined, is a non-negative + * integer.

*
 * @return The port component of this URI,
 *         or {@code -1} if the port is undefined
 */
@Override
public int getPort() {
    // The stored value is returned as is.
    return port;
}

/**
 * Returns the raw path component of this URI.
 *
 *

The path component of a URI, if defined, only contains the slash + * character ({@code '/'}), the commercial-at character ({@code '@'}), + * the colon character ({@code ':'}) and characters in the + * unreserved, sub-delims, percent-encoded, + * and other categories.

*
 * @return The path component of this URI,
 *         or {@code null} if the path is undefined
 */
@Override
public String getRawPath() {
    // Returned exactly as stored; no decoding is applied here.
    return path;
}

/**
 * Returns the decoded path component of this URI.
 *
 *

The string returned by this method is equal to that returned by the + * {@link #getRawPath() getRawPath} method except that all sequences of + * percent-encoded octets are decoded as specified + * earlier in this document.

+ * + * @return The decoded path component of this URI, + * or {@code null} if the path is undefined + */ + @Override + public String getPath() { + String decoded = decodedPath; + if (decoded == null) { + decodedPath = decoded = decode(path, DecodeInfo.PATH); + } + return decoded; + } + + /** + * Returns the raw query component of this URI. + * + *

The query component of a URI, if defined, only contains legal URI + * characters.

*
 * @return The raw query component of this URI,
 *         or {@code null} if the query is undefined
 */
@Override
public String getRawQuery() {
    // Returned exactly as stored; no decoding is applied here.
    return query;
}

/**
 * Returns the decoded query component of this URI.
 *
 *

The string returned by this method is equal to that returned by the + * {@link #getRawQuery() getRawQuery} method except that all sequences of + * percent-encoded octets are decoded + * as specified earlier + * in this document.

+ * + * @return The decoded query component of this URI, + * or {@code null} if the query is undefined + */ + @Override + public String getQuery() { + String decoded = decodedQuery; + if ((decoded == null) && (query != null)) { + decodedQuery = decoded = decode(query, DecodeInfo.QUERY); + } + return decoded; + } + + /** + * Returns the raw fragment component of this URI. + * + *

The fragment component of a URI, if defined, only contains legal URI + * characters.

*
 * @return The raw fragment component of this URI,
 *         or {@code null} if the fragment is undefined
 */
@Override
public String getRawFragment() {
    // Returned exactly as stored; no decoding is applied here.
    return fragment;
}

/**
 * Returns the decoded fragment component of this URI.
 *
 *

The string returned by this method is equal to that returned by the + * {@link #getRawFragment() getRawFragment} method except that all + * sequences of percent-encoded octets are decoded + * as specified earlier + * in this document. + *

+ * + * @return The decoded fragment component of this URI, + * or {@code null} if the fragment is undefined + */ + @Override + public String getFragment() { + String decoded = decodedFragment; + if ((decoded == null) && (fragment != null)) { + decodedFragment = decoded = decode(fragment, DecodeInfo.FRAG); + } + return decoded; + } + + + // -- Equality, comparison, hash code, toString, and serialization -- + + /** + * Tests this URI for equality with another object. + * + *

If the given object is not a URI then this method immediately + * returns {@code false}. + * + *

For two URIs to be considered equal requires that either both are + * opaque or both are hierarchical. Their schemes must either both be + * undefined or else be equal without regard to case. Their fragments + * must either both be undefined or else be equal. + * + *

For two opaque URIs to be considered equal, their scheme-specific + * parts must be equal. + * + *

For two hierarchical URIs to be considered equal, their paths must + * be equal and their queries must either both be undefined or else be + * equal. Their authorities must either both be undefined, or both + * their hosts must be equal without regard to case, their port numbers + * must be equal, and their user-information components must be equal. + * + *

When testing the user-information, path, query, fragment, authority, + * or scheme-specific parts of two URIs for equality, the decoded forms rather + * than the raw forms of these components are compared. + * + *

This method satisfies the general contract of the {@link + * Object#equals(Object) Object.equals} method.

+ * + * @param obj The object to which this object is to be compared + * + * @return {@code true} if, and only if, the given object is a URI that + * is identical to this URI + */ + public boolean equals(Object obj) { + if (obj == this) + return true; + if (!(obj instanceof IRI)) + return false; + + IRI that = (IRI)obj; + this.ensureComponentDecoded(); + that.ensureComponentDecoded(); + + if (this.isOpaque() != that.isOpaque()) return false; + + if (!equalIgnoringCase(this.scheme, that.scheme)) return false; + if (!equal(this.decodedFragment, that.decodedFragment)) return false; + + // Hierarchical or Opaque + if (!equal(this.decodedPath, that.decodedPath)) return false; + if (!equal(this.decodedQuery, that.decodedQuery)) return false; + + // Opaque will stop there as both authorities should be null + if (this.authority == that.authority) return true; + + // Hierarchical + if (this.host != null) { + // Server-based + if (!equal(this.decodedUserInfo, that.decodedUserInfo)) return false; + if (!equalIgnoringCase(this.decodedHost, that.decodedHost)) return false; + if (this.port != that.port) return false; + assert IRI.getHostType(decodedHost) == IRI.getHostType(that.decodedHost); + } else { + assert this.userInfo == null; + assert this.port == -1; + assert this.authority == null; + assert that.authority != null; + // this.authority is null but not that. 
+ return false; + } + + return true; + } + + private void ensureComponentDecoded() { + if ((decodedUserInfo == null) && (userInfo != null)) { + decodedUserInfo = decode(userInfo, DecodeInfo.USER); + } + if ((decodedAuthority == null) && (authority != null)) { + decodedAuthority = decode(authority, DecodeInfo.AUTH); + } + if ((decodedHost == null) && (host != null)) { + decodedHost = decode(host, DecodeInfo.HOST); + } + if (decodedPath == null) { + decodedPath = decode(path, DecodeInfo.PATH); + } + if ((decodedQuery == null) && (query != null)) { + decodedQuery = decode(query, DecodeInfo.QUERY); + } + if ((decodedFragment == null) && (fragment != null)) { + decodedFragment = decode(fragment, DecodeInfo.FRAG); + } + } + + /** + * Returns a hash-code value for this URI. The hash code is based upon all + * of the URI's components, and satisfies the general contract of the + * {@link Object#hashCode() Object.hashCode} method. + * + * @return A hash-code value for this URI + */ + public int hashCode() { + int h = hash; + if (h == 0) { + ensureComponentDecoded(); + h = hashIgnoringCase(0, scheme); + h = hash(h, decodedFragment); + h = hash(h, decodedPath); + h = hash(h, decodedQuery); + if (host != null) { + h = hash(h, decodedUserInfo); + h = hashIgnoringCase(h, decodedHost); + h += 1949 * port; + } else { + h = hash(h, decodedAuthority); + } + + if (h != 0) { + hash = h; + } else { + // don't allow 0 to avoid hashing again + hash = 0x0DEC0DED; + } + } + return h; + } + + /** + * Compares this URI to another object, which must be a URI. + * + *

When comparing corresponding components of two URIs, if one
 * component is undefined but the other is defined then the first is
 * considered to be less than the second.  Unless otherwise noted, string
 * components are ordered according to their natural, case-sensitive
 * ordering as defined by the {@link String#compareTo(String)
 * String.compareTo} method.  String components that are subject to
 * encoding are compared by comparing their decoded forms rather than
 * their raw forms.
 *
 *

The ordering of URIs is defined as follows:

+ * + * + * + *

This method satisfies the general contract of the {@link + * Comparable#compareTo(Object) Comparable.compareTo} + * method.

+ * + * @param that + * The object to which this URI is to be compared + * + * @return A negative integer, zero, or a positive integer as this URI is + * less than, equal to, or greater than the given URI + * + * @throws ClassCastException + * If the given object is not a URI + */ + public int compareTo(IRI that) { + int c; + if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) + return c; + + this.ensureComponentDecoded(); + that.ensureComponentDecoded(); + + if (this.isOpaque()) { + if (that.isOpaque()) { + // Both opaque + if ((c = compare(this.decodedPath, that.decodedPath)) != 0) + return c; + if ((c = compare(this.decodedQuery, that.decodedQuery)) != 0) + return c; + return compare(this.decodedFragment, that.decodedFragment); + } + return +1; // Opaque > hierarchical + } else if (that.isOpaque()) { + return -1; // Hierarchical < opaque + } + + // Hierarchical + if ((this.host != null) && (that.host != null)) { + // Both server-based + if ((c = compare(this.decodedUserInfo, that.decodedUserInfo)) != 0) + return c; + if ((c = compareIgnoringCase(this.decodedHost, that.decodedHost)) != 0) + return c; + if ((c = this.port - that.port) != 0) + return c; + assert IRI.getHostType(decodedHost).compareTo(IRI.getHostType(decodedHost)) == 0; + } else { + assert this.authority == null || that.authority == null; + // At least one authority component must be null. Find out which + if ((c = compare(this.decodedAuthority, that.decodedAuthority)) != 0) return c; + } + + if ((c = compare(this.decodedPath, that.decodedPath)) != 0) return c; + if ((c = compare(this.decodedQuery, that.decodedQuery)) != 0) return c; + return compare(this.decodedFragment, that.decodedFragment); + } + + /** + * Returns the content of this URI as a string in its original Unicode form. + * + *

If this URI was created by invoking one of the factory methods in this + * class then a string equivalent to the original input string, or to the + * string computed from the originally-given components, as appropriate, is + * returned. Otherwise this URI was created by normalization, resolution, + * or relativization, and so a string is constructed from this URI's + * components according to the rules specified in RFC 3986, + * section 5.3.

+ * + * @return The string form of this URI, in its original Unicode form. + */ + public String toString() { + String s = string; + if (s == null) { + s = defineString(); + } + return s; + } + + private static StringBuilder buildString(StringBuilder sb, + String scheme, + String userInfo, String host, int port, + String path, + String query, String fragment) { + if (scheme != null) { + sb.append(scheme); + sb.append(':'); + } + if (host != null) { + sb.append("//"); + if (userInfo != null) { + sb.append(userInfo); + sb.append('@'); + } + boolean needBrackets = ((host.indexOf(':') >= 0) + && !host.startsWith("[") + && !host.endsWith("]")); + if (needBrackets) sb.append('['); + sb.append(host); + if (needBrackets) sb.append(']'); + if (port != -1) { + sb.append(':'); + sb.append(port); + } + } + if (path != null) + sb.append(path); + if (query != null) { + sb.append('?'); + sb.append(query); + } + if (fragment != null) { + sb.append('#'); + sb.append(fragment); + } + return sb; + } + + // used by resolve() + private static StringBuilder buildString(StringBuilder sb, + String scheme, + String authority, + String path, + String query, String fragment) { + if (scheme != null) { + sb.append(scheme); + sb.append(':'); + } + if (authority != null) { + sb.append("//"); + sb.append(authority); + } + if (path != null) + sb.append(path); + if (query != null) { + sb.append('?'); + sb.append(query); + } + if (fragment != null) { + sb.append('#'); + sb.append(fragment); + } + return sb; + } + + private String defineString() { + String s = string; + if (s != null) { + return s; + } + + assert host != null || authority == null; + string = s = buildString(new StringBuilder(), + scheme, + userInfo, host, port, + path, query, fragment).toString(); + return s; + } + + /** + * Returns a representation of this IRI that only contains US-ASCII + * characters. Conceptually mapping an IRI representation to its URI + * representation. + * + *

If this IRI does not contain any characters in the other + * category then an invocation of this method will return the same value as + * an invocation of the {@link #toString() toString} method. Otherwise + * this method works as if by invoking that method and then encoding the result.

*
 * @return  The string form of this URI, encoded as needed
 *          so that it only contains US-ASCII characters.
 */
@Override
public String toASCIIString() {
    // Encode the plain string form.
    return encode(toString());
}


/**
 * Returns a decoded representation of this IRI that may contain unicode
 * characters. Conceptually mapping a URI representation to its IRI
 * representation.
 *
 *

If this URI does not contain any percent-encoded octets for all
 * components, then an invocation of this method will return the same
 * value as an invocation of the {@link #toString() toString} method.
 * Otherwise percent-encoded other characters are converted to
 * Unicode. Any percent-encoded octets which do not represent valid
 * UTF-8 characters or which represent reserved characters or
 * which are not allowed in IRIs, e.g. bidirectional formatting characters
 * (LRM, RLM, LRE, RLE, LRO, RLO, and PDF), are not converted.
 *
 * @return  The content of this URI as an IRI string,
 *          i.e. may contain non-US-ASCII characters.
 *
 */
public String toIRIString() {
    // Populate iriString if it has not been computed yet, then return it.
    defineIRIString();
    return iriString;
}

/**
 * Returns a decoded representation of this IRI that may contain
 * unicode characters, as well as printable non-US-ASCII
 * characters which would have been accepted by a
 * {@linkplain #parseLenient(String) lenient parser}.
 *
 *

This method acts as {@link #toString()}, but additionally
 * {@linkplain #unquoteLenient(String) leniently decodes printable
 * characters in US-ASCII that are not allowed in URIs}, but which are
 * sometimes used in queries (such as the pipe ({@code '|'}) character).
 *
 * @apiNote Note the resulting string may no longer be a
 * valid IRI, as it may contain characters that are not valid in URIs,
 * and could lead to dangerous misinterpretation if used in a wider
 * context (such as an XML document) without precaution.
 *
 * @implSpec
 *

This method is provided as a convenience method and is + * equivalent to calling {@code IRI.unquoteLenient(iri.toString())}. + * In addition for any string str for which + * {@code IRI.parseLenient(}str{@code)} doesn't fail, and then + *

IRI.parseLenient(str).toLenientString().equals(IRI.unquoteLenient(str))
+ * + * @return Returns a decoded representation of this IRI that + * may contain unicode characters, as well as printable non-US-ASCII + * character which are normally not allowed in an IRI or URI. + * + * @see #unquoteLenient(String) + * @see #parseLenient(String) + * @see #quoteLenient(String) + */ + public String toLenientString() { + defineString(); + // further leniently decode printable US-ASCII characters + // which are normally not used in URI. + return decode(string, DecodeInfo.LENIENT); + } + + // public static helpers + + /** + * Leniently pre-quote illegal printable US-ASCII characters. + * + * @apiNote + * RFC 3987 specifies that systems accepting IRIs may + * also deal with the printable characters in US-ASCII that are not + * allowed in URIs, namely {@code '<'}, {@code '>'}, {@code '"'}, + * space, {@code '{'}, {@code '}'}, {@code '|'}, {@code '\'}, + * {@code '^'}, and {@code '`'}. This method will leniently pre-quote + * these characters in the given string {@code s} so that the resulting + * string can be passed to {@link IRI#parseIRI(String)} or + * {@link #of(String)} without triggering an exception when + * they are encountered. + * + *

This method is provided as a convenience for those APIs
 * that produce strings in which these characters appear in raw
 * unquoted form in IRI strings or IRI components.
 * Note that blindly applying this method to any string
 * without prior validation is not encouraged, as the presence of
 * these characters in an IRI string could potentially be due to
 * malicious input.
 *
 * @param s The string in which illegal printable US-ASCII characters
 *          should be leniently pre-quoted. May be {@code null}, in which
 *          case {@code null} is returned.
 *
 * @return A string in which illegal printable US-ASCII characters
 *         have been leniently pre-quoted.
 *
 * @see #unquoteLenient(String)
 * @see #parseLenient(String)
 * @see #toLenientString()
 */
public static String quoteLenient(String s) {
    // Delegates to quote() with the lenient masks and NonASCII.NOQUOTES.
    return quote(s, L_LENIENT, H_LENIENT, NonASCII.NOQUOTES);
}

/**
 * Leniently unquote illegal printable US-ASCII characters.
 *
 * @apiNote
 * RFC 3987 specifies that systems accepting IRIs may
 * also deal with the printable characters in US-ASCII that are not
 * allowed in URIs, namely {@code '<'}, {@code '>'}, {@code '"'},
 * space, {@code '{'}, {@code '}'}, {@code '|'}, {@code '\'},
 * {@code '^'}, and {@code '`'}. This method will leniently unquote
 * these characters if present in quoted form in the given string
 * {@code s}, performing the opposite operation that
 * {@link #quoteLenient(String)} might have done.
 *
 *

Note however that when applied to the full string
 * representation of an IRI, such as returned by
 * {@link #toString()}, {@link #toASCIIString()}, or
 * {@link #toIRIString()}, the resulting string may no longer be a
 * valid IRI, as it may contain characters that are not valid in URIs,
 * and could lead to dangerous misinterpretation if used in a wider
 * context (such as an XML document) without precaution.
 *
 * @param s The string in which quoted illegal printable US-ASCII
 *          characters should be leniently unquoted. May be {@code null},
 *          in which case {@code null} is returned.
 *
 * @return A string in which quoted illegal printable US-ASCII characters
 *         have been leniently unquoted.
 *
 * @see #quoteLenient(String)
 * @see #parseLenient(String)
 * @see #toLenientString()
 */
public static String unquoteLenient(String s) {
    // Delegates to decode() in its lenient mode.
    return decode(s, DecodeInfo.LENIENT);
}

/**
 * Substitutes the percent character of any percent encoded octet found in
 * {@code s} with "%25", thus re-encoding any percent encoded octet {@code %hh}
 * into {@code %25hh}.
 *
 * @apiNote
 * This method can be used on input arguments prior to calling a
 * multi-argument factory method
 * if the behaviour that was previously implemented by
 * {@code java.net.URI} multi-argument constructors is desired.
 *
 * @implNote
 * This method behaves as {@code String.replaceAll("%(([0-9]|[a-fA-F]){2})", "%25$1")}.
+ * + * @param s an input string + * + * @return a string in which any percent encoded octets are quoted + **/ + public static String quoteEncodedOctets(String s) { + int start=0, n=0, len=-1; + StringBuilder sb = null; + while ((n = s.indexOf('%', n)) > -1) { + if (len == -1) len = s.length(); + if (n < len -2) { + char c1, c2; + if (match(c1=s.charAt(n+1), L_HEX, H_HEX) + && match(c2=s.charAt(n+2), L_HEX, H_HEX)) { + if (sb == null) { + sb = new StringBuilder(len + 3); + } + sb.append(s, start, n); + sb.append('%'); + sb.append('2'); + sb.append('5'); + sb.append(c1); + sb.append(c2); + start = n = n+3; + } else n++; + } else { + break; + } + } + if (sb != null && start < len) { + sb.append(s, start, len); + } + return sb == null ? s : sb.toString(); + } + + /** + * Decodes any percent encoded octet corresponding to a valid UTF-8 sequence. + * If {@code useReplacementChar} is true, then any triplet corresponding to + * an invalid UTF-8 octet will be replaced by the replacement char {@code U+FFFD}. + * Otherwise, the invalid triplet is simply preserved. + * + * @apiNote + * This method can be used on the result returned by the + * raw getters + * if the behaviour that was previously implemented by {@code java.net.URI} + * getters is desired. + * + * @param s a string such as returned by non-raw getters. + * @param useReplacementChar whether an invalid percent-encoded octet should be + * replaced with the replacement char {@code U+FFFD} + * + * @return a string in which all valid percent encoded sequences have been + * been decoded. + **/ + public static String unquoteEncodedOctets(String s, boolean useReplacementChar) { + DecodeInfo info = useReplacementChar + ? DecodeInfo.REPLACE_INVALID : DecodeInfo.ALL_VALID; + return decode(s, info); + } + + // -- Serialization support -- + + /** + * Saves the content of this URI to the given serial stream. + * + *

The only serializable field of a URI instance is its {@code string} + * field. That field is given a value, if it does not have one already, + * and then the {@link ObjectOutputStream#defaultWriteObject()} + * method of the given object-output stream is invoked.

*
 * @param  os  The object-output stream to which this object
 *             is to be written
 *
 * @throws IOException declared for the call to
 *         {@code os.defaultWriteObject()}
 */
private void writeObject(ObjectOutputStream os)
    throws IOException
{
    // Give the string field a value before default serialization runs.
    defineString();
    os.defaultWriteObject();        // Writes the string field only
}

/**
 * Reconstitutes a URI from the given serial stream.
 *
 *

The {@link ObjectInputStream#defaultReadObject()} method is
 * invoked to read the value of the {@code string} field.  The result is
 * then parsed in the usual way.
 *
 * @param  is  The object-input stream from which this object
 *             is being read
 */
private void readObject(ObjectInputStream is)
    throws ClassNotFoundException, IOException
{
    // Only the default fields are read here; the parsing of the string
    // is done in readResolve().
    is.defaultReadObject();
}

/**
 * Returns the {@code IRI} resulting from the parsed IRI-string.
 *
 * @return the {@code IRI} resulting from the parsed IRI-string
 * @throws ObjectStreamException an {@code InvalidObjectException} with
 *         the {@code URISyntaxException} as its cause, if the string
 *         cannot be parsed
 */
private Object readResolve() throws ObjectStreamException {
    try {
        return new Parser(string).parse(true);
    } catch (URISyntaxException x) {
        // Report the parse failure as a serialization error and keep
        // the original exception as the cause.
        InvalidObjectException y = new InvalidObjectException("Invalid URI");
        y.initCause(x);
        throw y;
    }
}


// -- End of public methods --


// -- Utility methods for string-field comparison and hashing --

// These methods return appropriate values for null string arguments,
// thereby simplifying the equals, hashCode, and compareTo methods.
+ + // US-ASCII only + private static int toLower(char c) { + if ((c >= 'A') && (c <= 'Z')) + return c + ('a' - 'A'); + return c; + } + + // US-ASCII only + private static int toUpper(char c) { + if ((c >= 'a') && (c <= 'z')) + return c - ('a' - 'A'); + return c; + } + + private static boolean equal(String s, String t) { + if (s == t) return true; + if ((s != null) && (t != null)) { + if (s.length() != t.length()) + return false; + if (s.indexOf('%') < 0) + return s.equals(t); + int n = s.length(); + for (int i = 0; i < n;) { + char c = s.charAt(i); + char d = t.charAt(i); + if (c != '%') { + if (c != d) + return false; + i++; + continue; + } + if (d != '%') + return false; + i++; + if (toLower(s.charAt(i)) != toLower(t.charAt(i))) + return false; + i++; + if (toLower(s.charAt(i)) != toLower(t.charAt(i))) + return false; + i++; + } + return true; + } + return false; + } + + private static boolean equalIgnoringCase(String s, String t) { + if (s == t) return true; + if ((s != null) && (t != null)) { + return s.equalsIgnoreCase(t); + } + return false; + } + + private static int hash(int hash, String s) { + if (s == null) return hash; + return s.indexOf('%') < 0 ? 
hash * 127 + s.hashCode() + : normalizedHash(hash, s); + } + + + // US-ASCII only + private static int normalizedHash(int hash, String s) { + int h = 0; + for (int index = 0; index < s.length(); index++) { + char ch = s.charAt(index); + h = 31 * h + ch; + if (ch == '%') { + /* + * Process the next two encoded characters + */ + for (int i = index + 1; i < index + 3; i++) + h = 31 * h + toUpper(s.charAt(i)); + index += 2; + } + } + return hash * 127 + h; + } + + private static int hashIgnoringCase(int hash, String s) { + if (s == null) return hash; + return hash * 31 + s.toLowerCase().hashCode(); + } + + private static int compare(String s, String t) { + if (s == t) return 0; + if (s != null) { + if (t != null) + return s.compareTo(t); + else + return +1; + } else { + return -1; + } + } + + private static int compareIgnoringCase(String s, String t) { + if (s == t) return 0; + if (s != null) { + if (t != null) { + return s.compareToIgnoreCase(t); + } + return +1; + } else { + return -1; + } + } + + + // -- String construction -- + + // If a scheme is given then the path, if given, must be absolute + // +// private static void checkPath(String s, String scheme, String path) +// throws URISyntaxException +// { +// if (scheme != null) { +// if ((path != null) +// && ((path.length() > 0) && (path.charAt(0) != '/'))) +// throw new URISyntaxException(s, +// "Relative path in absolute URI"); +// } +// } + + // check consistency of hierarchical IRI parameters + // If scheme and authority are absent, and the path contains + // a : then it needs to be protected + // + private static String checkHierarchicalPath(String scheme, String authority, String userinfo, + String host, int port, String path, + boolean reject) + { + if (path == null) return path; + if (authority != null || host != null) { + if (!path.isEmpty() && path.charAt(0) != '/') { + assert reject; // components should have been checked if reject=false; + throw new IllegalArgumentException( + "relative path with non 
null authority component"); + } + return path; + } + if (path.startsWith("//")) { + assert reject; // components should have been checked if reject=false; + throw new IllegalArgumentException( + "path cannot start with // when no authority is provided"); + } + if (scheme != null) { + if (!path.isEmpty() && path.charAt(0) != '/') { + assert reject; // components should have been checked if reject=false; + throw new IllegalArgumentException( + "hierarchical path must be absolute or empty when a scheme is provided"); + } + return path; + } + + // now we have a null scheme, and a null authority - so a + // path starting with xxxx:vvvv could be parsed as a + // scheme. + int qm, ps=-1; + if ((qm = path.indexOf(':')) > -1 && + ((ps = path.indexOf('/')) > qm || ps == -1)) { + // if the path is of the form xxxx:vvv where none of + // the x is a slash then we need to protect the path + // by prepending "./" to avoid having it interpreted + // as a scheme. + // Alternatively - we could choose to throw IAE? 
+ if (reject) + throw new IllegalArgumentException( + "path should start with \"./\" if its first segment has a ':'"); + path = "./" + path; + } + return path; + } + + private static void appendAuthority(StringBuilder sb, + String authority, + String userInfo, + String host, + int port) + { + if (host != null) { + sb.append("//"); + if (userInfo != null) { + sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); + sb.append('@'); + } + boolean needBrackets = ((host.indexOf(':') >= 0) + && !host.startsWith("[") + && !host.endsWith("]") + && getHostType(host, false).isLiteral()); + if (needBrackets) sb.append('['); + sb.append(quoteHost(host)); + if (needBrackets) sb.append(']'); + if (port != -1) { + sb.append(':'); + sb.append(port); + } + } else if (authority != null) { + sb.append("//"); + if (!appendIfIPv6Literal(sb, authority, + L_AUTHORITY, + H_AUTHORITY)) + { + sb.append(quote(authority, + L_AUTHORITY, + H_AUTHORITY)); + } + } + } + + // + // if given authority contains IPv6 literal, append it to + // sb and return true; + // otherwise, do nothing but return false + // + private static boolean appendIfIPv6Literal(StringBuilder sb, + String authority, + long lowMask, + long highMask) + { + String doquote, dontquote; + if (!authority.isEmpty() && authority.charAt(0) == '[') { + int end = authority.indexOf(']'); + if (end != -1 && isIPLiteralAddress( + dontquote = authority.substring(0, end+1))) { + if (end == authority.length()) { + doquote = ""; + } else { + doquote = authority.substring(end + 1); + } + sb.append(dontquote); + sb.append(quote(doquote, + lowMask, + highMask)); + return true; + } + } else { + // + // don't quote IPv6 address piece inside authority + // if given authority = 'userinfo@[IPV6]:port' + // + int start = authority.indexOf("@["); + int end = start == -1 ? 
-1 : authority.indexOf(']', start); + if (start != -1 && end != -1 + && isIPLiteralAddress( + dontquote = authority.substring(start+1, end+1))) { + if (start > 0) { + sb.append(quote(authority.substring(0, start), + L_USERINFO, H_USERINFO)); + } + sb.append('@'); + sb.append(dontquote); + sb.append(quote(authority.substring(end+1), lowMask, highMask)); + return true; + } + } + + return false; + } + + private static String toString(String scheme, + String authority, + String userInfo, + String host, + int port, + String path, + String query, + String fragment) + { + StringBuilder sb = new StringBuilder(); + + // append scheme + if (scheme != null) { + sb.append(scheme); + sb.append(':'); + } + + // append authority + appendAuthority(sb, authority, userInfo, host, port); + + // append path + if (path != null) { + sb.append(quote(path, L_PATH, H_PATH)); + } + + // append query + if (query != null) { + sb.append('?'); + sb.append(quote(query, L_QUERY, H_QUERY, NonASCII.QUERY)); + } + + // append fragment + if (fragment != null) { + sb.append('#'); + sb.append(quote(fragment, L_FRAGMENT, H_FRAGMENT)); + } + + // done... + return sb.toString(); + } + + private void defineIRIString() { + if (iriString != null) return; + // Use decodeIRI to preserve reentrant IRI strings. + // The main difference is that %25 will always be preserved + // when it's followed by two hex digits. + // So if you have something like %2541 or %25%34%31 it will + // produce %2541 in the IRI string, not %41, ensuring that + // parsing the IRI string and echoing it back produces the + // same IRI string. + iriString = toString(scheme, + decodeIRI(authority, DecodeInfo.AUTH), + decodeIRI(userInfo, DecodeInfo.USER), + decodeIRI(host, DecodeInfo.HOST), + port, + decodeIRI(path, DecodeInfo.PATH), + decodeIRI(query, DecodeInfo.QUERY), + decodeIRI(fragment, DecodeInfo.FRAG)); + } + + // -- Normalization, resolution, and relativization -- + + // RFC3986 sec. 
5.2 + private static String resolvePath(String base, String child, + boolean absolute) + { + int i = base.lastIndexOf('/'); + int cn = child.length(); + String path = ""; + + if (cn == 0) { + // 5.2 (6a) + if (i >= 0) + path = base.substring(0, i + 1); + } else { + StringBuilder sb = new StringBuilder(base.length() + cn); + // 5.2 (6a) + if (i >= 0) + sb.append(base, 0, i + 1); + // 5.2 (6b) + sb.append(child); + path = sb.toString(); + } + + // 5.2 (6c-f) + String np = normalize(path); + + // 5.2 (6g): If the result is absolute but the path begins with "../", + // then we simply leave the path as-is + + return np; + } + + // RFC3986 sec. 5.2 + private static IRI resolve(IRI base, IRI child) { + // check if child if opaque first so that NPE is thrown + // if child is null. + if (child.isOpaque() || base.isOpaque()) + return child; + + // 5.2.2 Target URI fields + String scheme; + String authority; + String userInfo ; + String host ; + int port; + String path; + String query; + String fragment; + + if (child.scheme != null) { + scheme = child.scheme; + authority = child.authority; + userInfo = child.userInfo; + host = child.host; + port = child.port; + path = normalize(child.path); + query = child.query; + } else { + if (child.authority != null) { + authority = child.authority; + userInfo = child.userInfo; + host = child.host; + port = child.port; + path = normalize(child.path); + query = child.query; + } else { + if (child.path.isEmpty()) { + path = base.path; + query = (child.query != null) ? child.query : base.query; + } else { + String p; + if (child.path.charAt(0) == '/') { + p = child.path; + } else { + p = mergePath(base, child); + } + path = normalize(p); + query = child.query; + } + authority = base.authority; + userInfo = base.userInfo; + host = base.host; + port = base.port; + } + scheme = base.scheme; + } + + fragment = child.fragment; + + // don't normalize authority + String input = (authority == null || !authority.endsWith(":")) + ? 
null // no need to eagerly define the string rep. + : buildString(new StringBuilder(), + scheme, authority, path, query, fragment).toString(); + + return new IRI(input, + scheme, + authority, + userInfo, + host, + port, + path, + query, + fragment); + } + + // RFC 3986 5.2.3 + private static String mergePath(IRI base, IRI child) { + StringBuilder sb = new StringBuilder(); + + if (base.authority != null && base.path.equals("")) { + sb.append('/').append(child.path); + return sb.toString(); + } else if (base.path.endsWith("/..") + || base.path.equals("..")) { + sb.append(base.path) + .append('/') + .append(child.path); + return sb.toString(); + } else { + int index = base.path.lastIndexOf('/'); + if (index != -1) { + sb.append(base.path.substring(0, index)) + .append('/') + .append(child.path); + return sb.toString(); + } else { + return child.path; + } + } + } + + // If the given URI's path is normal then return the URI; + // o.w., return a new URI containing the normalized path. + // + private static IRI normalize(IRI u) { + if (u.isOpaque()) return u; + + // normalize authority by removing superfluous colon + String na = (u.authority == null || !u.authority.endsWith(":")) + ? u.authority + : u.authority.substring(0, u.authority.length() -1); + + // normalize path + String np; + if (u.path.isEmpty()) { + np = u.path; + } else { + np = normalize(u.path); + } + + if (u.authority != null) { + // RFC 3986: 6.2.3. Scheme-Based Normalization: + // In general, a URI that uses the generic syntax for + // authority with an empty path should be normalized + // to a path of "/". + if (np == null || np.isEmpty()) { + np = "/"; + } + } + + // if nothing changed, we're done! 
+ if (np == u.path && na == u.authority) { + return u; + } + + if (u.scheme == null) { + np = checkHierarchicalPath(u.scheme, na, u.userInfo, u.host, + u.port, np, false); + } + + return new IRI(null, + u.scheme, + na, + u.userInfo, + u.host, + u.port, + np, + u.query, + u.fragment); + } + + // If both URIs are hierarchical, their scheme and authority components are + // identical, and the base path is a prefix of the child's path, then + // return a relative URI that, when resolved against the base, yields the + // child; otherwise, return the child. + // + private static IRI relativize(IRI base, IRI child) { + // check if child if opaque first so that NPE is thrown + // if child is null. + if (child.isOpaque() || base.isOpaque()) + return child; + if (!equalIgnoringCase(base.scheme, child.scheme) + || !equal(base.authority, child.authority)) + return child; + + String bp = normalize(base.path); + String cp = normalize(child.path); + + if (!bp.equals(cp)) { + int last = bp.lastIndexOf('/'); + // if the base path has no slash, then it must be empty, + // or relative. A path cannot be relative if the authority + // is not null. + assert last != -1 || bp.isEmpty() || base.authority == null; + + // if (last == -1) we need to replace the whole bp with "". + // if (last > -1) we replace only the last element. 
+ bp = bp.substring(0,last+1); + assert last == -1 && bp.isEmpty() || bp.charAt(last) == '/'; + + if (!cp.startsWith(bp)) + return child; + } + + String path = checkHierarchicalPath(null, null, null, null, -1, + cp.substring(bp.length()), false); + String query = child.query; + String fragment = child.fragment; + return new IRI(null, + null, + null, + null, + null, + -1, + path, + query, + fragment); + } + + + + // -- Path normalization -- + + // The following algorithm for path normalization avoids the creation of a + // string object for each segment, as well as the use of a string buffer to + // compute the final result, by using a single char array and editing it in + // place. The array is first split into segments, replacing each slash + // with '\0' and creating a segment-index array, each element of which is + // the index of the first char in the corresponding segment. We then walk + // through both arrays, removing ".", "..", and other segments as necessary + // by setting their entries in the index array to -1. Finally, the two + // arrays are used to rejoin the segments and compute the final result. + // + // This code is based upon src/solaris/native/java/io/canonicalize_md.c + + + // Check the given path to see if it might need normalization. A path + // might need normalization if it contains duplicate slashes, a "." + // segment, or a ".." segment. Return -1 if no further normalization is + // possible, otherwise return the number of segments found. + // + // This method takes a string argument rather than a char array so that + // this test can be performed without invoking path.toCharArray(). 
+ // + private static int needsNormalization(String path) { + boolean normal = true; + int ns = 0; // Number of segments + int end = path.length() - 1; // Index of last char in path + int p = 0; // Index of next char in path + + // Skip initial slashes + while (p <= end) { + if (path.charAt(p) != '/') break; + p++; + } + if (p > 1) normal = false; + + // Scan segments + while (p <= end) { + + // Looking at "." or ".." ? + if ((path.charAt(p) == '.') + && ((p == end) + || ((path.charAt(p + 1) == '/') + || ((path.charAt(p + 1) == '.') + && ((p + 1 == end) + || (path.charAt(p + 2) == '/')))))) { + normal = false; + } + ns++; + + // Find beginning of next segment + while (p <= end) { + if (path.charAt(p++) != '/') + continue; + + // Skip redundant slashes + while (p <= end) { + if (path.charAt(p) != '/') break; + normal = false; + p++; + } + + break; + } + } + + return normal ? -1 : ns; + } + + + // Split the given path into segments, replacing slashes with nulls and + // filling in the given segment-index array. 
+ // + // Preconditions: + // segs.length == Number of segments in path + // + // Postconditions: + // All slashes in path replaced by '\0' + // segs[i] == Index of first char in segment i (0 <= i < segs.length) + // + private static void split(char[] path, int[] segs) { + int end = path.length - 1; // Index of last char in path + int p = 0; // Index of next char in path + int i = 0; // Index of current segment + + // Skip initial slashes + while (p <= end) { + if (path[p] != '/') break; + path[p] = '\0'; + p++; + } + + while (p <= end) { + + // Note start of segment + segs[i++] = p++; + + // Find beginning of next segment + while (p <= end) { + if (path[p++] != '/') + continue; + path[p - 1] = '\0'; + + // Skip redundant slashes + while (p <= end) { + if (path[p] != '/') break; + path[p++] = '\0'; + } + break; + } + } + + if (i != segs.length) + throw new InternalError(); // ASSERT + } + + + // Join the segments in the given path according to the given segment-index + // array, ignoring those segments whose index entries have been set to -1, + // and inserting slashes as needed. Return the length of the resulting + // path. + // + // Preconditions: + // segs[i] == -1 implies segment i is to be ignored + // path computed by split, as above, with '\0' having replaced '/' + // + // Postconditions: + // path[0] .. 
path[return value] == Resulting path + // + private static int join(char[] path, int[] segs) { + int ns = segs.length; // Number of segments + int end = path.length - 1; // Index of last char in path + int p = 0; // Index of next path char to write + + if (path[p] == '\0') { + // Restore initial slash for absolute paths + path[p++] = '/'; + } + + for (int i = 0; i < ns; i++) { + int q = segs[i]; // Current segment + if (q == -1) + // Ignore this segment + continue; + + if (p == q) { + // We're already at this segment, so just skip to its end + while ((p <= end) && (path[p] != '\0')) + p++; + if (p <= end) { + // Preserve trailing slash + path[p++] = '/'; + } + } else if (p < q) { + // Copy q down to p + while ((q <= end) && (path[q] != '\0')) + path[p++] = path[q++]; + if (q <= end) { + // Preserve trailing slash + path[p++] = '/'; + } + } else + throw new InternalError(); // ASSERT false + } + + return p; + } + + + // Remove "." segments from the given path, and remove segment pairs + // consisting of a non-".." segment followed by a ".." segment. + // + private static void removeDots(char[] path, int[] segs) { + int ns = segs.length; + int end = path.length - 1; + + for (int i = 0; i < ns; i++) { + int dots = 0; // Number of dots found (0, 1, or 2) + + // Find next occurrence of "." or ".." + do { + int p = segs[i]; + if (path[p] == '.') { + if (p == end) { + dots = 1; + break; + } else if (path[p + 1] == '\0') { + dots = 1; + break; + } else if ((path[p + 1] == '.') + && ((p + 1 == end) + || (path[p + 2] == '\0'))) { + dots = 2; + break; + } + } + i++; + } while (i < ns); + if ((i > ns) || (dots == 0)) + break; + + if (dots == 1) { + // Remove this occurrence of "." + segs[i] = -1; + } else { + // If there is a preceding non-".." segment, remove both that + // segment and this occurrence of ".."; otherwise, leave this + // ".." segment as-is, unless if it's the first segment, and + // the path is absolute. In this latter case remove it. 
+ // See RFC 3986, Section 5.2.4. Remove Dot Segments + // and Section 5.4.2. Abnormal Examples + int j; + for (j = i - 1; j >= 0; j--) { + if (segs[j] != -1) break; + } + if (j >= 0) { + int q = segs[j]; + if (!((path[q] == '.') + && (path[q + 1] == '.') + && (path[q + 2] == '\0'))) { + segs[i] = -1; + segs[j] = -1; + } + } else if (j == -1 && path[0] == '\0') { + segs[i] = -1; + } + } + } + } + + + // DEVIATION: If the normalized path is relative, and if the first + // segment could be parsed as a scheme name, then prepend a "." segment + // + private static void maybeAddLeadingDot(char[] path, int[] segs) { + + if (path[0] == '\0') + // The path is absolute + return; + + int ns = segs.length; + int f = 0; // Index of first segment + while (f < ns) { + if (segs[f] >= 0) + break; + f++; + } + if ((f >= ns) || (f == 0)) + // The path is empty, or else the original first segment survived, + // in which case we already know that no leading "." is needed + return; + + int p = segs[f]; + while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++; + if (p >= path.length || path[p] == '\0') + // No colon in first segment, so no "." needed + return; + + // At this point we know that the first segment is unused, + // hence we can insert a "." segment at that position + path[0] = '.'; + path[1] = '\0'; + segs[0] = 0; + } + + + // Normalize the given path string. A normal path string has no empty + // segments (i.e., occurrences of "//"), no segments equal to ".", and no + // segments equal to ".." that are preceded by a segment not equal to "..". + // In contrast to Unix-style pathname normalization, for URI paths we + // always retain trailing slashes. + // + private static String normalize(String ps) { + + // Does this path need normalization? 
+ int ns = needsNormalization(ps); // Number of segments + if (ns < 0) + // Nope -- just return it + return ps; + + char[] path = ps.toCharArray(); // Path in char-array form + + // Split path into segments + int[] segs = new int[ns]; // Segment-index array + split(path, segs); + + // Remove dots + removeDots(path, segs); + + // Prevent scheme-name confusion + maybeAddLeadingDot(path, segs); + + // Join the remaining segments and return the result + String s = new String(path, 0, join(path, segs)); + if (s.equals(ps)) { + // string was already normalized + return ps; + } + return s; + } + + + + // -- Character classes for parsing -- + + // RFC2396 precisely specifies which characters in the US-ASCII charset are + // permissible in the various components of a URI reference. We here + // define a set of mask pairs to aid in enforcing these restrictions. Each + // mask pair consists of two longs, a low mask and a high mask. Taken + // together they represent a 128-bit mask, where bit i is set iff the + // character with value i is permitted. + // + // This approach is more efficient than sequentially searching arrays of + // permitted characters. It could be made still more efficient by + // precompiling the mask information so that a character's presence in a + // given mask could be determined by a single table lookup. + + // To save startup time, we manually calculate the low-/highMask constants. 
+ // For reference, the following methods were used to calculate the values: + + // Compute the low-order mask for the characters in the given string + private static long lowMask(String chars) { + int n = chars.length(); + long m = 0; + for (int i = 0; i < n; i++) { + char c = chars.charAt(i); + if (c < 64) + m |= (1L << c); + } + return m; + } + + // Compute the high-order mask for the characters in the given string + private static long highMask(String chars) { + int n = chars.length(); + long m = 0; + for (int i = 0; i < n; i++) { + char c = chars.charAt(i); + if ((c >= 64) && (c < 128)) + m |= (1L << (c - 64)); + } + return m; + } + + // Compute a low-order mask for the characters + // between first and last, inclusive + private static long lowMask(char first, char last) { + long m = 0; + int f = Math.max(Math.min(first, 63), 0); + int l = Math.max(Math.min(last, 63), 0); + for (int i = f; i <= l; i++) + m |= 1L << i; + return m; + } + + // Compute a high-order mask for the characters + // between first and last, inclusive + private static long highMask(char first, char last) { + long m = 0; + int f = Math.max(Math.min(first, 127), 64) - 64; + int l = Math.max(Math.min(last, 127), 64) - 64; + for (int i = f; i <= l; i++) + m |= 1L << i; + return m; + } + + // Tell whether the given character is permitted by the given mask pair + private static boolean match(char c, long lowMask, long highMask) { + if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. + return false; + if (c < 64) + return ((1L << c) & lowMask) != 0; + if (c < 128) + return ((1L << (c - 64)) & highMask) != 0; + return false; + } + + // Character-class masks, in reverse order from RFC3986 because + // initializers for static fields cannot make forward references. 
+ + // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | + // "8" | "9" + private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9'); + private static final long H_DIGIT = 0L; + + // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | + // "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | + // "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" + private static final long L_UPALPHA = 0L; + private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z'); + + // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | + // "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | + // "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" + private static final long L_LOWALPHA = 0L; + private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z'); + + // alpha = lowalpha | upalpha + private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA; + private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA; + + // alphanum = alpha | digit + private static final long L_ALPHANUM = L_DIGIT | L_ALPHA; + private static final long H_ALPHANUM = H_DIGIT | H_ALPHA; + + // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | + // "a" | "b" | "c" | "d" | "e" | "f" + private static final long L_HEX = L_DIGIT; + private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f'); + + // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" + // / "*" / "+" / "," / ";" / "=" + // TODO: inline value + private static final long L_SUB_DELIMS = lowMask("!$&'()*+,;="); + private static final long H_SUB_DELIMS = highMask("!$&'()*+,;="); + + // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" + // TODO: inline value + private static final long L_GEN_DELIMS = lowMask(":/?#[]@"); + private static final long H_GEN_DELIMS = highMask(":/?#[]@"); + + // mark = "-" | "_" | "." | "!" 
| "~" | "*" | "'" | + // "(" | ")" + // TODO: inline value + private static final long L_MARK = lowMask("-._~"); + private static final long H_MARK = highMask("-._~"); + + // unreserved = alphanum | mark + private static final long L_UNRESERVED = L_ALPHANUM | L_MARK; + private static final long H_UNRESERVED = H_ALPHANUM | H_MARK; + + // reserved = gen-delims / sub-delims + private static final long L_RESERVED = L_SUB_DELIMS | L_GEN_DELIMS; + private static final long H_RESERVED = H_SUB_DELIMS | H_GEN_DELIMS; + + // The zero'th bit is used to indicate that escape pairs and non-US-ASCII + // characters are allowed; this is handled by the scanEscape method below. + private static final long L_ESCAPED = 1L; + private static final long H_ESCAPED = 0L; + + // uric = reserved | unreserved | escaped + // private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED; + // private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED; + + // pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + // TODO: inline value + private static final long L_PCHAR + = L_UNRESERVED | L_ESCAPED | L_SUB_DELIMS | lowMask(":@"); + private static final long H_PCHAR + = H_UNRESERVED | H_ESCAPED | H_SUB_DELIMS | highMask(":@"); + + // fragment = *( pchar / "/" / "?" ) + // TODO: inline value + private static final long L_FRAGMENT = L_PCHAR | lowMask("/?"); + private static final long H_FRAGMENT = H_PCHAR | highMask("/?"); + + // query = *( pchar / "/" / "?" 
) + // TODO: inline value + private static final long L_QUERY = L_PCHAR | lowMask("/?"); + private static final long H_QUERY = H_PCHAR | highMask("/?"); // All valid path characters + + + // All valid path characters + // TODO: inline value + private static final long L_PATH = L_PCHAR | lowMask("/"); + private static final long H_PATH = H_PCHAR | highMask("/"); + + // Dash, for use in domainlabel and toplabel + // private static final long L_DASH = 0x200000000000L; // lowMask("-"); + // private static final long H_DASH = 0x0L; // highMask("-"); + + // Dot, for use in hostnames + private static final long L_DOT = 0x400000000000L; // lowMask("."); + private static final long H_DOT = 0x0L; // highMask("."); + + // Dash, for use in domainlabel and toplabel + private static final long L_DASH = 0x200000000000L; // lowMask("-"); + private static final long H_DASH = 0x0L; // highMask("-"); + + // Colon, for use in addresses + // TODO: inline value + private static final long L_COLON = lowMask(":"); + private static final long H_COLON = highMask(":"); + + // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + private static final long L_USERINFO + = L_UNRESERVED | L_ESCAPED | L_SUB_DELIMS | L_COLON; + private static final long H_USERINFO + = H_UNRESERVED | H_ESCAPED | H_SUB_DELIMS | H_COLON; + + // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + // This mask is only intended to cover the second part of the rule: + // 1*( unreserved / sub-delims / ":" ) + // though it also covers the first part as "v" 1*HEXDIG "." are all + // unreserved. 
+ private static final long L_IPVFUTURE + = L_UNRESERVED | L_SUB_DELIMS | L_COLON; + private static final long H_IPVFUTURE + = H_UNRESERVED | H_SUB_DELIMS | H_COLON; + + // reg_name = *( unreserved / pct-encoded / sub-delims ) + private static final long L_REG_NAME + = L_UNRESERVED | L_ESCAPED | L_SUB_DELIMS; + private static final long H_REG_NAME + = H_UNRESERVED | H_ESCAPED | H_SUB_DELIMS; + + // authority = [ userinfo "@" ] host [ ":" port ] + // TODO: inline value + private static final long L_AUTHORITY + = L_USERINFO | L_REG_NAME | L_DIGIT | lowMask("@:"); + private static final long H_AUTHORITY + = H_USERINFO | H_REG_NAME | H_DIGIT | highMask("@:"); + + // scheme = alpha *( alpha | digit | "+" | "-" | "." ) + private static final long L_SCHEME = L_ALPHA | L_DIGIT | 0x680000000000L; // lowMask("+-."); + private static final long H_SCHEME = H_ALPHA | H_DIGIT; // | highMask("+-.") == 0L + + // scope_id = alpha | digit | "_" | "." + private static final long L_SCOPE_ID + = L_ALPHANUM | 0x400000000000L; // lowMask("_."); + private static final long H_SCOPE_ID + = H_ALPHANUM | 0x80000000L; // highMask("_."); + + // Masks used for lenient parsing + // TODO: inline value + private static final long L_NOTALLOWED = lowMask("<>\" {}|\\^`"); + private static final long H_NOTALLOWED = highMask("<>\" {}|\\^`"); + private static final long L_LENIENT = ~L_NOTALLOWED; + private static final long H_LENIENT = ~H_NOTALLOWED; + + + // -- Escaping and encoding -- + + private static final char[] hexDigits = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' + }; + + private static void appendEscape(StringBuilder sb, byte b) { + sb.append('%'); + sb.append(hexDigits[(b >> 4) & 0x0f]); + sb.append(hexDigits[(b >> 0) & 0x0f]); + } + + private static void appendEscape(CharBuffer cb, byte b) { + cb.append('%'); + cb.append(hexDigits[(b >> 4) & 0x0f]); + cb.append(hexDigits[(b >> 0) & 0x0f]); + } + + // deals with surrogate pair, returns the index of the 
last escaped + // char (either pos or pos+1) + private static int appendEncoded(StringBuilder sb, CharSequence s, int pos, char c) { + ByteBuffer bb = null; + try { + if (Character.isHighSurrogate(c) && pos < s.length() - 1) { + assert s.charAt(pos) == c; + char ca[] = {c, s.charAt(++pos)}; + assert Character.isLowSurrogate(ca[1]); + bb = ThreadLocalCoders.encoderFor(StandardCharsets.UTF_8) + .encode(CharBuffer.wrap(ca)); + } else { + bb = ThreadLocalCoders.encoderFor(StandardCharsets.UTF_8) + .encode(CharBuffer.wrap("" + c)); + } + } catch (CharacterCodingException x) { + assert false; + } + while (bb.hasRemaining()) { + int b = bb.get() & 0xff; + appendEscape(sb, (byte)b); + } + return pos; + } + + // RFC 3987 defines these two categories - which in this document are + // collapsed into the single category 'other'. + // + // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF + // / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD + // / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD + // / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD + // / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD + // / %xD0000-DFFFD / %xE1000-EFFFD + // + // iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD + // + // These definitions leave several 'holes': + // + // 1. high-surrogate/low-surrogate used in UTF-16 encoding + // U+D800-U+DBFF and U+DC00-U+DFFF + // 2. non characters: U+FDD0-U+FDEF and chars ending with FFFE-FFFF + // 3. special chars U+FFF0-U+FFFD - which include the replacement char + // U+FFFD. + // + // 1. should not occur - since Java default encoding is UTF-16, then + // surrogate pairs should produce a code point and surrogate chars + // that don't produce a code point should form an illegal sequence. + // 2. These characters are not characters but they will not be + // considered as illegal sequences by the decoders. + // This class will need to handle them as bidi chars [TODO]. + // 3. 
Special character should be quoted - just as controls and spaces + // + // Additional restrictions: + // + // 4. In addition - RFC 3987 specifies that bidi-chars should not be rendered + // but should be quoted when mapping a URI to/from an IRI. + // We also don't decode bidi chars in decoded strings, but treat + // them as invalid sequences. + // + + // Special characters in the range U+FFF0-U+FFFD + private static boolean isSpecial(char c) { + return c >= 0xFFF0 && c <= 0xFFFD; + } + + // Non char in the Basic Multilingual Plane: + // U+FDD0-U+FDEF and U+FFFE-U+FFFF + private static boolean isNonCharBMP(char c) { + return c >= 0xFDD0 && (c <= 0xFDEF || c == 0xFFFE || c == 0xFFFF); + } + + private static boolean isNonChar(CharSequence s, int pos, char c) { + assert s.charAt(pos) == c; + if (Character.isHighSurrogate(c)) { + int cp = Character.codePointAt(s, pos); + int masked = cp & 0xFFFF; + // any codepoint ending with FFFF or FFFE is a + // non character + return masked == 0xFFFF || masked == 0xFFFE; + } else { + return isNonCharBMP(c); + } + } + + private static boolean isPrivate(CharSequence s, int pos, char c) { + if (c >= 0xE000 && c <= 0xF8FF) return true; // BMP + if (Character.isHighSurrogate(c)) { // supplementary planes + int cp = Character.codePointAt(s, pos); + if (cp >= 0xF0000) { + if (cp <= 0xFFFFD) return true; + return cp >= 0x100000 && cp <= 0x10FFFD; + } + } + return false; + } + + // + // test if the string contains bidi formatting character sequence + // at the given position: + // bidi chars value represent in UTF-8 character set + // LRM \u200E %E2%80%8E + // RLM \u200F %E2%80%8F + // LRE \u202A %E2%80%AA + // RLE \u202B %E2%80%AB + // PDF \u202C %E2%80%AC + // LRO \u202D %E2%80%AD + // RLO \u202E %E2%80%AE + // + // IRIs MUST NOT contain bidirectional formatting characters + // (LRM, RLM, LRE, RLE, LRO, RLO, and PDF). 
+ private static boolean isBidi(char c) { + return c >= 0x200E && c <= 0x202E && (c <= 0x200F || c >= 0x202A); + } + + private static boolean mustStayEncoded(CharSequence s, int pos, char c) { + return isBidi(c) || isNonChar(s, pos, c); + } + + /** + * This enumeration is used to provide further indication on + * how a given host component was parsed by an {@link IRI} + * instance. + * + * @since TBD + * + * @see IRI#getHostType(String) + */ + public static enum HostType { + /** + * Represents a host component that parses as an IPv4 literal + * address (RFC 3986 Appendix A: IPv4address). + */ + IPv4, + + /** + * Represents a host component that parses as an IPv6 literal + * address (RFC 3986 Appendix A: IP-literal, IPv6address). + */ + IPv6, + + /** + * Represents a host component that parses as an IPvFuture literal + * address (RFC 3986 Appendix A: IP-literal, IPvFuture). + */ + IPvFuture, + + /** + * Represents a host component that parses as a reg-name + * and additionally conforms to the DNS syntax. + * (RFC 3986 Appendix A: reg-name, further conforming to + * RFC 2396 Appendix A: hostname) + */ + DNSRegName, + + /** + * Represents a host component that parses as a reg-name + * but does not necessarily conform to the DNS syntax. + * (Any name allowed by RFC 3987 Section 2.2: ireg-name + * which doesn't necessarily conform to RFC 2396 Appendix A: hostname) + */ + RegName, + + /** + * Represent a host component which is absent. + */ + None; + + /** + * Tells whether a host component was parsed as a + * literal address type. + * + * @apiNote + * + * This method returns true for {@link #IPv4}, {@link #IPv6} + * and {@link #IPvFuture}. + * + * @return True if this value represents a host + * component that was parsed as a literal + * address type, false otherwise. 
+ */ + public boolean isLiteral() { + switch (this) { + case IPv4: return true; + case IPv6: return true; + case IPvFuture: return true; + default: return false; + } + } + + /** + * Tells whether a host component corresponds to an Internet + * literal address or host name. + * + * This method yields true for those syntax types which are + * known to be usable to connect to a host on the Internet + * without further syntax processing, such as IPv4 and IPv6 + * literals, as well as for host names that conform to the + * DNS syntax. + * + * @apiNote + * + * This method returns true for {@link #IPv4}, {@link #IPv6} + * and {@link #DNSRegName}, false otherwise. + * + * @return True if this value represents an address + * form that can be used to connect to + * a host on the Internet without further + * syntax processing. + */ + public boolean isInternetName() { + switch (this) { + case IPv4: return true; + case IPv6: return true; + case DNSRegName: return true; + default: return false; + } + } + } + + + /** + * Indicates how a given {@code hostString} component parses according to + * RFC 3987 grammar. + * This method can be called to provide further information about the + * syntactic form of the {@linkplain #getRawHostString() raw host string} + * or {@linkplain #getHostString() decoded host string} of an IRI. + * + *

    + *
  1. {@link HostType#None None}: if the given {@code hostString} is null.
  2. + *
  3. {@link HostType#IPv4 IPv4}: the given {@code hostString} can be parsed + * as an IPv4 literal address.
  4. + *
  5. {@link HostType#IPv6 IPv6}: the given {@code hostString} can be parsed + * as an IPv6 literal address.
  6. + *
  7. {@link HostType#IPvFuture IPvFuture}: the given {@code hostString} can + * be parsed as an IPvFuture literal address.
  8. + *
  9. {@link HostType#DNSRegName DNSRegName}: the given {@code hostString} + * can be parsed as a reg-name, and additionally conforms to the DNS + * syntax as specified by RFC 2386 hostname.
  10. + *
  11. {@link HostType#RegName RegName}: the given {@code hostString} + * is parsed as an ireg-name (but may be empty)
  12. + *
+ * + * @apiNote + * + * Usually, for any IRI u, calling {@code IRI.getHostType(u.getHostString())} + * or {@code IRI.getHostType(u.getRawHostString())} should yield the same result, + * except when the {@linkplain #getRawHostString() raw host string} contains + * unreserved US-ASCII characters in percent-encoded form. For example, + * the raw host string {@code "%41%42%43.example.com"} will parse as + * a {@linkplain HostType#RegName reg-name}, whereas its decoded form + * {@code "ABC.example.com"} will parse as a {@linkplain HostType#DNSRegName + * DNS reg-name}. + * When providing an IRI, whether to use the raw form or the decoded form of + * the host component is usually left up to the code that accepts the IRI. + * This method can help the caller decide whether the IRI it hands off, or that + * it accepts, is appropriate to use by the underlying APIs that it wants to + * call. + * + * @param hostString The host component string, as returned by one of + * {@link #getHost()}, {@link #getHostString()} or + * {@link #getRawHostString()}. + * + * @return A {@link HostType} value that indicates how the + * + */ + public static HostType getHostType(String hostString) { + // TODO: cache value during parsing? + return hostString == null ? HostType.None : getHostType(hostString, true); + } + + // + // quote the given host string if it is a reg-name, + // i.e. neither IPv4Address nor IPv6Address nor IPvFuture + // + private static String quoteHost(String host) { + HostType addressType = getHostType(host, false); + if (addressType.isLiteral()) { + // IPv4 doesn't need to be quoted + // % in IPv6 should not be quoted (it's the scope delimiter) + // IPvFuture doesn't allow percent encoded chars + return host; + } + // otherwise it's a reg-name - and anything within + // must be quoted. + return quote(host, L_REG_NAME, H_REG_NAME); + } + + // Returns true if the host is a literal IP address. 
+ // This can be an IPv4 literal address, an IPv6 literal + // address, enclosed or not in square brackets, or + // an IPvFuture literal address (in which case square + // brackets around it are required) + private static boolean isIPLiteralAddress(String host) { + return getHostType(host, false).isLiteral(); + } + + // Figure out how a host component was parsed. + // If parseDNS is true, further extends the analysis to figure + // out if a RegName is a DNSRegName. + private static HostType getHostType(String host, boolean parseDNS) { + if (IPAddressUtil.isIPv4LiteralAddress(host) + && isStrictIPv4Address(host)) { + return HostType.IPv4; + } else { + boolean betweenBrackets = (host.startsWith("[") && host.endsWith("]")); + String literalIPv6 = null; + + // a literal IPv6 address may or may not be surrounded by brackets + // normalize it before being feeded to isIPv6LiteralAddress() + if (betweenBrackets) + literalIPv6 = host.substring(1, host.length() - 1); + else + literalIPv6 = host; + + if (IPAddressUtil.isIPv6LiteralAddress(literalIPv6)) + return HostType.IPv6; + // require brackets for IPvFuture + if (betweenBrackets && isIPvFuture(literalIPv6)) + return HostType.IPvFuture; + } + return (parseDNS && isDNSName(host)) ? HostType.DNSRegName : HostType.RegName; + } + + // Returns true if the literalIP string is an IPvFuture + // literal address (square brackets must have been + // removed before calling this method). + private static boolean isIPvFuture(String literalIP) + { + int p = 0; + int q; + int n = literalIP.length(); + + if (p >= n || literalIP.charAt(p++) != 'v') { // check and skip 'v' + return false; + } + + for (q = p; q < n && match(literalIP.charAt(q), L_HEX, H_HEX); q++); + if (q <= p || q >= n) return false; + p = q; + if (p >= n || literalIP.charAt(p++) != '.') { // check and skip '.' 
+ return false; + } + + for (q = p; q < n && match(literalIP.charAt(q), + L_IPVFUTURE, H_IPVFUTURE); q++); + if (q <= p || q != n) { + return false; + } + + return true; + } + + // scans the next char in the char sequence. + private static int scanAscii(CharSequence input, int start, int end, char c) { + if ((start < end) && (input.charAt(start) == c)) + return start + 1; + return start; + } + + // scans a char sequence until a character that doesn't match the provided + // mask is found. Doesn't handle escape sequences. + private static int scanAscii(CharSequence input, int start, int n, long lowMask, long highMask) + { + int p = start; + while (p < n) { + char c = input.charAt(p); + if (match(c, lowMask, highMask)) { + p++; + continue; + } + break; + } + return p; + } + + // Scan a string of decimal digits whose value fits in a byte + // + private static int scanByte(CharSequence input, int start, int n) { + int p = start; + int q = scanAscii(input,p, n, L_DIGIT, H_DIGIT); + if (q <= p) return q; + if (Integer.parseInt(input, p, q, 10) > 255) return p; + return q; + } + + // Scan an IPv4 address. + // + // If the strict argument is true then we require that the given + // interval contain nothing besides an IPv4 address; if it is false + // then we only require that it start with an IPv4 address. + // + // If the interval does not contain or start with (depending upon the + // strict argument) a legal IPv4 address characters then we return -1 + // immediately; otherwise we insist that these characters parse as a + // legal IPv4 address and throw an exception on failure. + // + // We assume that any string of decimal digits and dots must be an IPv4 + // address. It won't parse as a hostname anyway, so making that + // assumption here allows more meaningful exceptions to be thrown. 
+ // + private static int scanIPv4Address(CharSequence input, int start, int n, boolean strict) { + int p = start; + int q; + int m = scanAscii(input, p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT); + if ((m <= p) || (strict && (m != n))) + return -1; + for (;;) { + // Per RFC2732: At most three digits per byte + // Further constraint: Each element fits in a byte + if ((q = scanByte(input, p, m)) <= p) break; p = q; + if ((q = scanAscii(input, p, m, '.')) <= p) break; p = q; + if ((q = scanByte(input, p, m)) <= p) break; p = q; + if ((q = scanAscii(input, p, m, '.')) <= p) break; p = q; + if ((q = scanByte(input, p, m)) <= p) break; p = q; + if ((q = scanAscii(input, p, m, '.')) <= p) break; p = q; + if ((q = scanByte(input, p, m)) <= p) break; p = q; + if (q < m) break; + return q; + } + return -q -2; + } + + private static boolean isStrictIPv4Address(String host) { + int len = host.length(); + int res = scanIPv4Address(host, 0, len, true); + return res == len; + } + + // Returns true if the host name conforms to the DNS syntax + private static boolean isDNSName(String host) { + int p = 0, n = host.length(); + int q; + int l = -1; // Start of last parsed label + + do { + // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ] + q = scanAscii(host, p, n, L_ALPHANUM, H_ALPHANUM); + if (q <= p) + break; + l = p; + if (q > p) { + p = q; + q = scanAscii(host, p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH); + if (q > p) { + if (host.charAt(q - 1) == '-') + return false; + p = q; + } + } + q = scanAscii(host, p, n, '.'); + if (q <= p) + break; + p = q; + } while (p < n); + + if (p < n) return false; + + if (l < 0) return false; + + // for a fully qualified hostname check that the rightmost + // label starts with an alpha character. + if (l > 0 && !match(host.charAt(l), L_ALPHA, H_ALPHA)) { + return false; + } + + return true; + } + + // This interface is used to figure out which non-ASCII chars + // (greater than 0x80 (128)) are illegal and must be quoted. 
+ // In query, iprivate chars are legal, but not elsewhere. + // + private interface NonASCII { + + // these chars are illegal in query and must be quoted + private static boolean basicQuoting(CharSequence s, int pos, char c) { + return Character.isSpaceChar(c) + || Character.isISOControl(c) + || isSpecial(c) + || mustStayEncoded(s, pos, c); + } + + // these chars are illegal everywhere else (except query) + // and must be quoted. + private static boolean defaultQuoting(CharSequence s, int pos, char c) { + return basicQuoting(s, pos, c) || isPrivate(s, pos, c); + } + + default boolean needQuoting(CharSequence s, int pos, char c) { + return defaultQuoting(s, pos, c); + } + + default int escapeLength(CharSequence s, int pos, char c) { + return Character.isHighSurrogate(c) + && ++pos < s.length() + && Character.isLowSurrogate(s.charAt(pos)) ? 2 : 1; + } + + // default quoting: control, space, private chars, etc... + static final NonASCII DEFAULT = new NonASCII() {}; + + // query quoting: same as default but allows private chars + static final NonASCII QUERY = new NonASCII() { + @Override + public boolean needQuoting(CharSequence s, int pos, char c) { + return NonASCII.basicQuoting(s, pos, c); + } + }; + + // no quoting: all non-ascii chars are allowed in unquoted form. + // this is used in case where the string is supposed to be already + // quoted: unquoted char may later cause an exception to be thrown. + static final NonASCII NOQUOTES = new NonASCII() { + @Override + public boolean needQuoting(CharSequence s, int pos, char c) { + return false; + } + }; + + static NonASCII quotingFor(String what) { + return "query".equals(what) ? 
QUERY : DEFAULT; + } + } + + // Quote any characters in s that are not permitted + // by the given mask pair + // + private static String quote(String s, long lowMask, long highMask) { + return quote(s, lowMask, highMask, NonASCII.DEFAULT); + } + + private static String quote(String s, long lowMask, long highMask, NonASCII quoting) { + if (s == null) + return null; + + StringBuilder sb = null; + boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); + int len = s.length(); + for (int i = 0; i < len; i++) { + char c = s.charAt(i); + if (c < '\u0080') { + if (!match(c, lowMask, highMask) && !isEscaped(s, i)) { + if (sb == null) { + sb = new StringBuilder(); + sb.append(s, 0, i); + } + appendEscape(sb, (byte)c); + } else { + if (sb != null) + sb.append(c); + } + } else if (allowNonASCII + && quoting.needQuoting(s, i,c)) { + if (sb == null) { + sb = new StringBuilder(); + sb.append(s, 0, i); + } + i = appendEncoded(sb, s, i, c); + } else { + if (sb != null) + sb.append(c); + } + } + return (sb == null) ? 
s : sb.toString(); + } + + // + // To check if the given string has an escaped triplet + // at the given position + // + private static boolean isEscaped(CharSequence s, int pos) { + if (s == null || ((s.length() -2) <= pos)) + return false; + + return s.charAt(pos) == '%' + && match(s.charAt(pos + 1), L_HEX, H_HEX) + && match(s.charAt(pos + 2), L_HEX, H_HEX); + } + + // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, + // assuming that s is otherwise legal + // + private static String encode(String s) { + int n = s.length(); + if (n == 0) + return s; + + // First check whether we actually need to encode + for (int i = 0;;) { + if (s.charAt(i) >= '\u0080') + break; + if (++i >= n) + return s; + } + + String ns = Normalizer.normalize(s, Normalizer.Form.NFC); + ByteBuffer bb = null; + try { + bb = ThreadLocalCoders.encoderFor(StandardCharsets.UTF_8) + .encode(CharBuffer.wrap(ns)); + } catch (CharacterCodingException x) { + assert false; + } + + StringBuilder sb = new StringBuilder(); + while (bb.hasRemaining()) { + int b = bb.get() & 0xff; + if (b >= 0x80) + appendEscape(sb, (byte)b); + else + sb.append((char)b); + } + return sb.toString(); + } + + private static int decode(char c) { + if ((c >= '0') && (c <= '9')) + return c - '0'; + if ((c >= 'a') && (c <= 'f')) + return c - 'a' + 10; + if ((c >= 'A') && (c <= 'F')) + return c - 'A' + 10; + assert false; + return -1; + } + + private static byte decode(char c1, char c2) { + return (byte)( ((decode(c1) & 0xf) << 4) + | ((decode(c2) & 0xf) << 0)); + } + + // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes + // that escapes are well-formed syntactically, i.e., of the form %XX. If a + // sequence of escaped octets is not valid UTF-8 then the erroneous octets + // are copied to the result string. + // Exception: any "%" found between "[]" is left alone. 
// It is an IPv6 literal
// with a scope_id
//
// private static String decode(String s) {
//     return decode(s, DecodeInfo.DEFAULT);
// }

/**
 * The DecodeInfo interface is used to tweak the decoding algorithm
 * according to different component needs. For instance, %2F should
 * be preserved when decoding path, but not when decoding fragments.
 * Similarly %40 and %3A should be preserved when decoding authority,
 * but not when decoding host, etc...
 */
private interface DecodeInfo {
    // true if '%' found between '[' and ']' must be copied as is
    default boolean ignorePercentInBrackets() { return false; }
    // true if invalid octet sequences are replaced with U+FFFD instead
    // of being copied in escaped form
    default boolean useReplacementChar() { return false; }
    // true if the input may contain '%' that do not start a %XX triplet;
    // such '%' are then copied as is
    default boolean canContainPercent() { return false; }
    // true for strings made of several components (see SSP and AUTH)
    default boolean isComposed() { return false;}
    // true if the decoded char c, at position pos in s, must be emitted
    // in percent-encoded form
    default boolean preservePercentEncoding(CharSequence s, int pos, char c) {
        return mustStayEncoded(s, pos, c);
    }
    static final DecodeInfo DEFAULT = new DecodeInfo() {};

    static final DecodeInfo USER = DEFAULT;

    // We should preserve % in brackets when decoding
    // host. The only place where brackets are found
    // are for IPv6 literal and IPvFuture.
    // IPvFuture can't contain %encoded octets,
    // and encoded reg-name can't contain decoded brackets
    // so return true will work here.
    static final DecodeInfo HOST = new DecodeInfo() {
        @Override
        public boolean ignorePercentInBrackets() { return true; }
    };

    // it is not safe to decode %2F in path
    static final DecodeInfo PATH = new DecodeInfo() {
        @Override
        public boolean preservePercentEncoding(CharSequence s, int pos, char c) {
            return c == '/' || mustStayEncoded(s, pos, c);
        }
    };

    // no special handling for %2F in query.
    // A URI string should be quoteEncodedOctets() before being
    // embedded in a query string.
    static final DecodeInfo QUERY = DEFAULT;

    // it is safe to decode %2F in fragment,
    static final DecodeInfo FRAG = DEFAULT;

    // Decodes a scheme specific part string.
    // We must not decode /@[]:? if found encoded in
    // encoded SSP, as the result would no longer
    // be parsable.
    // The only place where we can find non encoded
    // brackets is in IP literal - where % is a
    // scope - so don't try to decode %encoded octets
    // when found in brackets.
    static final DecodeInfo SSP = new DecodeInfo() {
        @Override
        public boolean ignorePercentInBrackets() { return true; }
        @Override
        public boolean isComposed() { return true; }
        @Override
        public boolean preservePercentEncoding(CharSequence s, int pos, char c) {
            return c == '/' || c == '@' || c == ':'
                    || c == '[' || c == ']' || c == '?'
                    || mustStayEncoded(s, pos, c);
        }
    };

    // Decodes an authority string.
    // We must not decode /@[]: if found encoded in
    // encoded authority, as the result would no longer
    // be parsable.
    // The only place where we can find non encoded
    // brackets is in IP literal - where % is a
    // scope - so don't try to decode %encoded octets
    // when found in brackets.
    // NOTE(review): '/' is listed above but is not preserved by the
    // code below - confirm which one is intended.
    static final DecodeInfo AUTH = new DecodeInfo() {
        @Override
        public boolean isComposed() { return true; }
        @Override
        public boolean ignorePercentInBrackets() { return true; }
        @Override
        public boolean preservePercentEncoding(CharSequence s, int pos, char c) {
            return c == '@' || c == ':' || c == '['
                    || c == ']' || mustStayEncoded(s, pos, c);
        }
    };

    // Leniently decodes US-ASCII printable chars
    static final DecodeInfo LENIENT = new DecodeInfo() {
        @Override
        public boolean canContainPercent() { return true; }
        @Override
        public boolean preservePercentEncoding(CharSequence s, int pos, char c) {
            return c >= 0x80 || match(c, L_LENIENT, H_LENIENT);
        }
    };

    // Decodes all percent encoded sequences, except invalid
    // sequences.
    static final DecodeInfo ALL_VALID = new DecodeInfo() {
        @Override
        public boolean canContainPercent() { return true; }
        @Override
        public boolean preservePercentEncoding(CharSequence s, int pos, char c) {
            return isNonChar(s, pos, c);
        }
    };

    // Decodes all percent encoded sequences, replace invalid
    // sequences with U+FFFD.
    static final DecodeInfo REPLACE_INVALID = new DecodeInfo() {
        @Override
        public boolean useReplacementChar() { return true; }
        @Override
        public boolean canContainPercent() { return true; }
        @Override
        public boolean preservePercentEncoding(CharSequence s, int pos, char c) {
            return isNonChar(s, pos, c);
        }
    };

}

// This method was introduced as a generalization of URI.decode method
// to provide a fix for JDK-8037396
private static String decode(String s, DecodeInfo info) {
    return decode(s, info, false);
}

// Same as decode(s, info), with the toIRIString flag set.
private static String decodeIRI(String s, DecodeInfo info) {
    return decode(s, info, true);
}

// Evaluates the escapes found in s according to the given DecodeInfo.
// A null, empty or '%'-free string is returned as is. Otherwise each
// run of consecutive %XX triplets is gathered into a byte buffer, decoded
// as UTF-8, and handed to appendDecoded(); octets that do not decode are
// either replaced with U+FFFD or copied in escaped form, depending on
// info.useReplacementChar().
// Unless info.canContainPercent() is true, every '%' outside of ignored
// brackets is assumed to be followed by two hex digits.
private static String decode(String s, DecodeInfo info, boolean toIRIString) {
    if (s == null)
        return s;
    int n = s.length();
    if (n == 0)
        return s;
    if (s.indexOf('%') < 0)
        return s;

    StringBuilder sb = new StringBuilder(n);
    ByteBuffer bb = ByteBuffer.allocate(n);
    CharBuffer cb = CharBuffer.allocate(n);
    CharsetDecoder dec = ThreadLocalCoders.decoderFor(StandardCharsets.UTF_8)
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    // This is not horribly efficient, but it will do for now
    char c = s.charAt(0);
    boolean betweenBrackets = false;
    boolean ignorePercentInBracket = info.ignorePercentInBrackets();
    boolean canContainPercent = info.canContainPercent();

    for (int i = 0; i < n;) {
        assert c == s.charAt(i);    // Loop invariant
        if (c == '[') {
            betweenBrackets = true;
        } else if (betweenBrackets && c == ']') {
            betweenBrackets = false;
        }

        // Anything that is not the start of an escape to evaluate is
        // copied as is.
        if (c != '%' || (betweenBrackets && ignorePercentInBracket)
                || (canContainPercent && !isEscaped(s, i))) {
            sb.append(c);
            ++i;
            if (i >= n) continue;
            c = s.charAt(i);
            continue;
        }
        bb.clear();
        int ui = i;
        // Gather the octets of all the consecutive triplets
        for (;;) {
            assert (n - i >= 2);
            bb.put(decode(s.charAt(i+1), s.charAt(i+2)));
            if ((i += 3) >= n)
                break;
            c = s.charAt(i);
            if (c != '%' || canContainPercent && !isEscaped(s, i))
                break;
        }
        bb.flip();
        cb.clear();
        dec.reset();
        CoderResult cr;
        do {
            cr = dec.decode(bb, cb, true);
            if (cr.isMalformed() || cr.isUnmappable()) {
                // eat the offending bytes (cr.length() of them) at the
                // current position and try to decode again
                if (cb.position() > 0) {
                    // pass n to prevent the method from looking up
                    // the next character in the string.
                    appendDecoded(sb, s, n, cb, info, toIRIString);
                    assert !cb.hasRemaining();
                    cb.clear();
                }
                if (info.useReplacementChar()) {
                    for (int j = cr.length() ; j>0 ; j--) bb.get();
                    sb.append("\ufffd");
                } else {
                    for (int j = cr.length() ; j>0 ; j--) {
                        appendEscape(sb, bb.get());
                    }
                }
            }
        } while (!cr.isUnderflow());
        cr = dec.flush(cb);
        assert cr.isUnderflow();
        appendDecoded(sb, s, i, cb, info, toIRIString);
    }

    return sb.toString();
}

// Append the characters decoded in 'cb' to the string builder 'sb',
// re-encoding them if needed with respect to DecodeInfo.
// 's' is the string being decoded and 'i' the position in 's' that
// follows the escapes from which 'cb' was decoded: it is used to look
// at the chars following a decoded '%' once 'cb' is exhausted.
private static void appendDecoded(StringBuilder sb, String s, int i,
                                  CharBuffer cb, DecodeInfo info, boolean toIRIString) {
    cb.flip();
    boolean composed = info.isComposed();
    while(cb.hasRemaining()) {
        // check whether character that were percent
        // encoded needs to stay percent encoded.
        // We must be careful here because our CharSequence
        // is a CharBuffer. Therefore charAt() is relative to
        // the current position in the buffer.
        char ch = cb.charAt(0);
        int r;
        char c1, c2, c3;
        // A char stays encoded if DecodeInfo says so, or - for composed
        // strings and IRI strings - if it is a '%' that, once decoded,
        // would be followed by two hex digits (taken from the rest of
        // 'cb' first, then from 's' starting at 'i').
        if (info.preservePercentEncoding(cb, 0, ch)
                || (toIRIString || composed) && ch == '%' && (r=cb.remaining()) >= 1
                && s.length() > (r > 2 ? i-1 : r > 1 ? i : (i+1))
                && match(c1 = r > 1 ? cb.charAt(1) : s.charAt(i), L_HEX, H_HEX)
                && match(c2 = r > 2 ? cb.charAt(2) : s.charAt(r > 1 ? i : (i+1)), L_HEX, H_HEX)
                //&& ((c3 = (char)(decode(c1,c2) & 0xFF)) < 0x80)
                //&& !info.preservePercentEncoding(String.valueOf(c3), 0, c3)
        ) {
            if (ch < 0x80) {
                appendEscape(sb, (byte)cb.get()); // advance
            } else {
                int consumed = appendEncoded(sb, cb, 0, ch) + 1;
                cb.position(cb.position() + consumed); // advance 1 or 2
            }
        } else {
            sb.append(cb.get());
        }
    }

}


// -- Parsing --

// For convenience we wrap the input URI string in a new instance of the
// following internal class.  This saves always having to pass the input
// string as an argument to each internal scan/parse method.

private static class Parser {

    private String input;           // URI input string
    // Components found so far
    private String scheme;
    private String authority;
    private String userInfo;
    private String host;
    private int port = -1;          // -1 until a port number is parsed
    private String path;
    private String query;
    private String fragment;

    private Parser(String s) {
        input = s;
    }

    // -- Methods for throwing URISyntaxException in various ways --

    private void fail(String reason) throws URISyntaxException {
        throw new URISyntaxException(input, reason);
    }

    // p is the index in the input string at which the error occurred
    private void fail(String reason, int p) throws URISyntaxException {
        throw new URISyntaxException(input, reason, p);
    }

    private void failExpecting(String expected, int p)
        throws URISyntaxException
    {
        fail("Expected " + expected, p);
    }


    // -- Simple access to the input string --

    // Tells whether start < end and, if so, whether charAt(start) == c
    //
    private boolean at(int start, int end, char c) {
        return (start < end) && (input.charAt(start) == c);
    }

    // Tells whether start + s.length() <= end and, if so,
    // whether the chars at the start position match s exactly
    //
    private boolean at(int start, int end, String s) {
        int p = start;
        int sn = s.length();
        if (sn > end - p)
            return false;
        int i = 0;
        while (i < sn) {
            if (input.charAt(p++) != s.charAt(i)) {
                break;
            }
            i++;
        }
        return (i == sn);
    }


    // -- Scanning --

    // The various scan and parse methods that follow use a uniform
    // convention of taking the current start position and end index as
    // their first two arguments.  The start is inclusive while the end is
    // exclusive, just as in the String class, i.e., a start/end pair
    // denotes the half-open interval [start, end) of the input string.
    //
    // These methods never proceed past the end position.  They may return
    // -1 to indicate outright failure, but more often they simply return
    // the position of the first char after the last char scanned.  Thus
    // a typical idiom is
    //
    //     int p = start;
    //     int q = scan(p, end, ...);
    //     if (q > p)
    //         // We scanned something
    //         ...;
    //     else if (q == p)
    //         // We scanned nothing
    //         ...;
    //     else if (q == -1)
    //         // Something went wrong
    //         ...;


    // Scan a specific char: If the char at the given start position is
    // equal to c, return the index of the next char; otherwise, return the
    // start position.
    //
    private int scan(int start, int end, char c) {
        if ((start < end) && (input.charAt(start) == c))
            return start + 1;
        return start;
    }

    // Scan forward from the given start position.  Stop at the first char
    // in the err string (in which case -1 is returned), or the first char
    // in the stop string (in which case the index of that char is
    // returned), or the end position (in which case end is returned).
    // May return the start position if nothing matches.
    //
    private int scan(int start, int end, String err, String stop) {
        int p = start;
        while (p < end) {
            char c = input.charAt(p);
            if (err.indexOf(c) >= 0)
                return -1;
            if (stop.indexOf(c) >= 0)
                break;
            p++;
        }
        return p;
    }

    // Scan forward from the given start position.
Stop at the first char + // in the stop string (in which case the index of the preceding char is + // returned), or the end of the input string (in which case the length + // of the input string is returned). May return the start position if + // nothing matches. + // + private int scan(int start, int end, String stop) { + int p = start; + while (p < end) { + char c = input.charAt(p); + if (stop.indexOf(c) >= 0) + break; + p++; + } + return p; + } + + // Scan a potential escape sequence, starting at the given position, + // with the given first char (i.e., charAt(start) == c). + // + // This method assumes that if escapes are allowed then visible + // non-US-ASCII chars are also allowed. + // + private int scanEscape(int start, int n, char first, NonASCII quoting) + throws URISyntaxException + { + int p = start; + char c = first; + if (c == '%') { + // Process escape pair + if ((p <= n - 3) + && match(input.charAt(p + 1), L_HEX, H_HEX) + && match(input.charAt(p + 2), L_HEX, H_HEX)) { + return p + 3; + } + fail("Malformed escape pair", p); + } else if ((c > 128) + && !quoting.needQuoting(input, p, c)) { + // Take into account surrogate pairs + return p + quoting.escapeLength(input, p, c); + } + return p; + } + + // Scan chars that match the given mask pair + // + private int scan(int start, int n, long lowMask, long highMask) + throws URISyntaxException + { + return scan(start, n, lowMask, highMask, NonASCII.DEFAULT); + } + + private int scan(int start, int n, long lowMask, long highMask, NonASCII quoting) + throws URISyntaxException + { + int p = start; + while (p < n) { + char c = input.charAt(p); + if (match(c, lowMask, highMask)) { + p++; + continue; + } + if ((lowMask & L_ESCAPED) != 0) { + int q = scanEscape(p, n, c, quoting); + if (q > p) { + p = q; + continue; + } + } + break; + } + return p; + } + + // Check that each of the chars in [start, end) matches the given mask + // + private void checkChars(int start, int end, + long lowMask, long highMask, + 
String what) + throws URISyntaxException + { + // NonASCII.quotingFor(what) relies on the fact that what = "query" + // when we parse query strings... + int p = scan(start, end, lowMask, highMask, NonASCII.quotingFor(what)); + if (p < end) + fail("Illegal character in " + what, p); + } + + // Check that the char at position p matches the given mask + // + private void checkChar(int p, + long lowMask, long highMask, + String what) + throws URISyntaxException + { + checkChars(p, p + 1, lowMask, highMask, what); + } + + + // -- Parsing -- + + // URI-reference = URI / relative-ref + // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] + // relative-ref = relative-part [ "?" query ] [ "#" fragment ] + // + private IRI parse(boolean rsa) throws URISyntaxException { + int n = input.length(); + int p = scan(0, n, "/?#", ":"); + int ssp; + if ((p >= 0) && at(p, n, ':')) { + if (p == 0) + failExpecting("scheme name", 0); + checkChar(0, L_ALPHA, H_ALPHA, "scheme name"); + checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name"); + scheme = input.substring(0, p); + p++; // Skip ':' + ssp = p; + p = parseHierpart(p, n); + } else { + ssp = 0; + p = parseRelativepart(0, n); + } + if (at(p, n, '?')) { + int q = scan(p, n, "", "#"); + checkChars(p + 1, q, L_QUERY, H_QUERY, "query"); + query = input.substring(p + 1, q); + p = q; + } + if (at(p, n, '#')) { + checkChars(p + 1, n,L_FRAGMENT, H_FRAGMENT, "fragment"); + fragment = input.substring(p + 1, n); + p = n; + } + if (p < n) + fail("end of URI", p); + + assert host != null || authority == null; + return new IRI(input, scheme, authority, + userInfo, host, port, path, query, fragment); + } + + // hier-part = "//" authority path-abempty + // / path-absolute + // / path-rootless, i.e. 
opaque + // / path-empty + private int parseHierpart(int start, int n) + throws URISyntaxException + { + int p = start; + if (at(p, n, '/') && at(p + 1, n, '/')) { + p += 2; + int q = scan(p, n, "/?#"); + if (q >= p) { + p = parseAuthority(p, q); + } else if (q <= n) { + // empty authority + } else + failExpecting("authority", p); + } + + return parsePath(p, n); + } + + // relative-part = "//" authority path-abempty + // / path-absolute + // / path-noscheme + // / path-empty + private int parseRelativepart(int start, int n) + throws URISyntaxException { + int p = start; + if (at(p, n, '/') && at(p + 1, n, '/')) { + p += 2; + int q = scan(p, n, "", "/?#"); + if (q >= p) { + p = parseAuthority(p, q); + } else if (q < n) { + // empty authority + } else + failExpecting("authority", p); + } + return parsePath(p, n); + } + + + private int parsePath(int start, int n) + throws URISyntaxException + { + int p = start; + int q = scan(p, n, "", "?#"); + checkChars(p, q, L_PATH, H_PATH, "path"); + path = input.substring(p, q); // May be "" + return q; + } + + // [@][:] + // + private int parseAuthority(int start, int n) + throws URISyntaxException + { + int p = start; + int q; + URISyntaxException ex = null; + + // userinfo + q = scan(p, n, "/?#", "@"); + if ((q >= p) && at(q, n, '@')) { + checkChars(p, q, L_USERINFO, H_USERINFO, "user info"); + userInfo = input.substring(p, q); + p = q + 1; // Skip '@' + } + + // host = IP-literal / IPv4address / reg-name + if (at(p, n, '[')) { + // IPv6address + // DEVIATION from RFC3986: Support scope id + p++; + q = scan(p, n, "/?#", "]"); + if ((q > p) && at(q, n, ']')) { + // look for a "%" scope id + int r = scan(p, q, "%"); + if (r != q) { + if (r+1 == q) { + fail ("scope id expected"); + } + parseIPv6Reference(p, r); + checkChars(r+1, q, L_SCOPE_ID, H_SCOPE_ID, + "scope id"); + } else { + parseIPv6ReferenceOrIPvFuture(p, q); + } + host = input.substring(p - 1, q + 1); + p = q + 1; + } else { + failExpecting("closing bracket for IPv6 or 
IPvFuture address", q); + } + } else { + q = parseIPv4Address(p, n); + if (q <= p) + q = parseRegname(p, n); + p = q; + } + + // port + if (at(p, n, ':')) { + p++; + q = scan(p, n, "/"); + if (q > p) { + checkChars(p, q, L_DIGIT, H_DIGIT, "port number"); + try { + port = Integer.parseInt(input, p, q, 10); + } catch (NumberFormatException x) { + fail("Malformed port number", p); + } + p = q; + } + } + if (p < n) + failExpecting("port number", p); + + authority = input.substring(start, n); + + return n; + } + + // Scan an IPv4 address. + // + // If the strict argument is true then we require that the given + // interval contain nothing besides an IPv4 address; if it is false + // then we only require that it start with an IPv4 address. + // + // If the interval does not contain or start with (depending upon the + // strict argument) a legal IPv4 address characters then we return -1 + // immediately; otherwise we insist that these characters parse as a + // legal IPv4 address and throw an exception on failure. + // + // We assume that any string of decimal digits and dots must be an IPv4 + // address. It won't parse as a hostname anyway, so making that + // assumption here allows more meaningful exceptions to be thrown. + // + private int scanIPv4Address(int start, int n, boolean strict) + throws URISyntaxException + { + int p = start; + int q = IRI.scanIPv4Address(input, start, n, strict); + if (q == -1) return -1; + if (q < -1) { + fail("Malformed IPv4 address", -q - 2); + } + return q; + } + + // Take an IPv4 address: Throw an exception if the given interval + // contains anything except an IPv4 address + // + private int takeIPv4Address(int start, int n, String expected) + throws URISyntaxException + { + int p = scanIPv4Address(start, n, true); + if (p <= start) + failExpecting(expected, start); + return p; + } + + // Attempt to parse an IPv4 address, returning -1 on failure but + // allowing the given interval to contain [:] after + // the IPv4 address. 
+ // + private int parseIPv4Address(int start, int n) { + int p; + + try { + p = scanIPv4Address(start, n, false); + } catch (URISyntaxException x) { + return -1; + } catch (NumberFormatException nfe) { + return -1; + } + + if (p > start && p < n) { + // IPv4 address is followed by something - check that + // it's a ":" as this is the only valid character to + // follow an address. + if (input.charAt(p) != ':') { + p = -1; + } + } + + if (p > start) + host = input.substring(start, p); + + return p; + } + + // + // reg-name = *( unreserved / pct-encoded / sub-delims ) + // + // One corner case is that RFC3986's rule authority has replaced + // RFC2396's rule server. But hostname as in 2396 can't be empty, + // while host as in 3986 can. The context is: + // hier-part = "//" authority path-abempty + // authority = [ userinfo "@" ] host [ ":" port ] + // host = IP-literal / IPv4address / reg-name + // path-abempty = *( "/" segment ) + // So if reg-name is empty, the authority component must be + // something like [ userinfo "@" ][ ":" port ], i.e. a :-sign + // or a /-sign is anticipated if empty reg-name. + // + private int parseRegname(int start, int n) + throws URISyntaxException + { + int p = start; + int q = scan(p, n, L_REG_NAME, H_REG_NAME); + if (q < n && input.charAt(q) != ':' && input.charAt(q) != '/') + fail("Illegal character in hostname", q); + host = input.substring(start, q); + return q; + } + + + // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture + // + // Bug: The grammar in RFC2373 Appendix B does not allow addresses of + // the form ::12.34.56.78, which are clearly shown in the examples + // earlier in the document. 
Here is the original grammar: + // + // IPv6address = hexpart [ ":" IPv4address ] + // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] + // hexseq = hex4 *( ":" hex4) + // hex4 = 1*4HEXDIG + // + // We therefore use the following revised grammar: + // + // IPv6address = hexseq [ ":" IPv4address ] + // | hexseq [ "::" [ hexpost ] ] + // | "::" [ hexpost ] + // hexpost = hexseq | hexseq ":" IPv4address | IPv4address + // hexseq = hex4 *( ":" hex4) + // hex4 = 1*4HEXDIG + // + // This covers all and only the following cases: + // + // hexseq + // hexseq : IPv4address + // hexseq :: + // hexseq :: hexseq + // hexseq :: hexseq : IPv4address + // hexseq :: IPv4address + // :: hexseq + // :: hexseq : IPv4address + // :: IPv4address + // :: + // + // Additionally we constrain the IPv6 address as follows :- + // + // i. IPv6 addresses without compressed zeros should contain + // exactly 16 bytes. + // + // ii. IPv6 addresses with compressed zeros should contain + // less than 16 bytes. + + private int ipv6byteCount = 0; + + private int parseIPv6Reference(int start, int n) + throws URISyntaxException + { + int p = start; + int q; + boolean compressedZeros = false; + + q = scanHexSeq(p, n); + + if (q > p) { + p = q; + if (at(p, n, "::")) { + compressedZeros = true; + p = scanHexPost(p + 2, n); + } else if (at(p, n, ':')) { + p = takeIPv4Address(p + 1, n, "IPv4 address"); + ipv6byteCount += 4; + } + } else if (at(p, n, "::")) { + compressedZeros = true; + p = scanHexPost(p + 2, n); + } + if (p < n) + fail("Malformed IPv6 address", start); + if (ipv6byteCount > 16) + fail("IPv6 address too long", start); + if (!compressedZeros && ipv6byteCount < 16) + fail("IPv6 address too short", start); + if (compressedZeros && ipv6byteCount == 16) + fail("Malformed IPv6 address", start); + + return p; + } + + + // + // IPvFuture parsing :- + // IPvFuture = "v" 1*HEXDIG "." 
1*( unreserved / sub-delims / ":" ) + // + private int parseIPvFuture(int start, int n) + throws URISyntaxException + { + int p = start; + int q; + + if (!at(p++, n, 'v')) { // check and skip 'v' + fail("Malformed IPvFuture address", p); + } + q = scan(p, n, L_HEX, H_HEX); + if (q <= p) { + fail("Malformed IPvFuture address", q); + } + + p = q; + if (!at(p++, n, '.')) { // check and skip '.' + fail("Malformed IPvFuture address", p); + } + + q = scan(p, n, L_UNRESERVED | L_SUB_DELIMS | L_COLON, + H_UNRESERVED | H_SUB_DELIMS | H_COLON); + + if (q <= p || q != n) { + fail("Malformed IPvFuture address", q); + } + + return q; + } + + private int parseIPv6ReferenceOrIPvFuture(int start, int n) + throws URISyntaxException + { + if (input.charAt(start) == 'v') { + return parseIPvFuture(start, n); + } else { + return parseIPv6Reference(start, n); + } + } + + private int scanHexPost(int start, int n) + throws URISyntaxException + { + int p = start; + int q; + + if (p == n) + return p; + + q = scanHexSeq(p, n); + if (q > p) { + p = q; + if (at(p, n, ':')) { + p++; + p = takeIPv4Address(p, n, "hex digits or IPv4 address"); + ipv6byteCount += 4; + } + } else { + p = takeIPv4Address(p, n, "hex digits or IPv4 address"); + ipv6byteCount += 4; + } + return p; + } + + // Scan a hex sequence; return -1 if one could not be scanned + // + private int scanHexSeq(int start, int n) + throws URISyntaxException + { + int p = start; + int q; + + q = scan(p, n, L_HEX, H_HEX); + if (q <= p) + return -1; + if (at(q, n, '.')) // Beginning of IPv4 address + return -1; + if (q > p + 4) + fail("IPv6 hexadecimal digit sequence too long", p); + ipv6byteCount += 2; + p = q; + while (p < n) { + if (!at(p, n, ':')) + break; + if (at(p + 1, n, ':')) + break; // "::" + p++; + q = scan(p, n, L_HEX, H_HEX); + if (q <= p) + failExpecting("digits for an IPv6 address", p); + if (at(q, n, '.')) { // Beginning of IPv4 address + p--; + break; + } + if (q > p + 4) + fail("IPv6 hexadecimal digit sequence too long", 
p); + ipv6byteCount += 2; + p = q; + } + + return p; + } + + } + + /** + * Returns a {@link Builder} built from this IRI, with + * the given {@code capabilities} set. + * + * @apiNote + * This method can be used to easily change some components + * of an IRI - for instance: + *
+     *    IRI iri = ...;
+     *    IRI changed = iri.with(Builder.DEFAULT_CAPABILITY)
+     *          .scheme("https").build();
+ * will return a new IRI identical to the original IRI + * except that the scheme will have been substituted with + * {@code "https"}. + * + * @implSpec + * The new builder is populated with the components of this + * IRI. + *

The {@link Builder#QUOTE_ENCODED_CAPABILITY} capability, + * if set, only impacts components which are explicitly + * changed through the {@link Builder} API. + * In all cases, {@code iri.with(cap).build().equals(iri)}, + * where {@code cap} is either {@link Builder#DEFAULT_CAPABILITY} + * or {@link Builder#QUOTE_ENCODED_CAPABILITY}.

+ * + * @implNote + * This implementation also ensures that + * {@code iri.toString().equals(iri.with(cap).build().toString())}, + * where {@code cap} is either {@link Builder#DEFAULT_CAPABILITY} + * or {@link Builder#QUOTE_ENCODED_CAPABILITY}. + * + * @param capabilities The builder capability set. + * + * @return A new builder pre-populated with the components of + * this IRI, and with the given {@code capabilities}. + */ + public Builder with(int capabilities) { + return new Builder(this, capabilities); + } + + /** + * Creates a new {@link Builder}. + * @return A new IRI Builder. + */ + public static Builder newBuilder() { + return newBuilder(Builder.DEFAULT_CAPABILITY); + } + + /** + * Creates a new {@link Builder}, with the given capabilities. + * @param capabilities The new builder capability set. + * + * @implNote + * This implementation only supports the following + * capabilities: + *
    + *
  • {@link Builder#DEFAULT_CAPABILITY}: the default capabilities + * (empty set).
  • + *
  • {@link Builder#QUOTE_ENCODED_CAPABILITY}: the builder will + * automatically {@linkplain #quoteEncodedOctets(String) quote} + * percent-encoded octets present in input parameters - in the + * same manner that {@link java.net.URI} used to do.
  • + *
+ * @return A new IRI Builder. + */ + public static Builder newBuilder(int capabilities) { + return new Builder(capabilities); + } + + /** + * A component-wise builder of IRIs. + * + *

If desired, a builder can be created with a set of enhanced + * capabilities that may impact how input parameters are handled + * or how the IRI is eventually built. + * + *

Because full validation of the IRI can only be performed once all + * the components are known, then full validation of the URI syntax + * is delayed until the builder's {@link #build()} method is called. + * + * @apiNote + * Typical usage example: + *

+     *     IRI iri;
+     *     try {
+     *         iri = IRI.newBuilder()
+     *             .scheme("http")
+     *             .host("www.example.com")
+     *             .path("/sample")
+     *             .query("version=12")
+     *             .build();
+     *     } catch(URISyntaxException x) {
+     *         System.out.println("Failed to build IRI: " + x.getMessage());
+     *     }
+     * 
+ * + * @implNote + * This implementation only supports the following capabilities: + *
    + *
  • {@link Builder#DEFAULT_CAPABILITY}: the default capabilities + * (empty set).
  • + *
  • {@link Builder#QUOTE_ENCODED_CAPABILITY}: the builder will + * automatically {@linkplain #quoteEncodedOctets(String) re-quote} + * percent-encoded octets present in input parameters - in the + * same manner that {@link java.net.URI} used to do.
  • + *
+ * + * @since TBD + * + */ + public static final class Builder { + /** + * Represents the default capabilities (an empty set). + */ + public static final int DEFAULT_CAPABILITY = 0; + + /** + * Represents the capability of a builder to pre-process its input + * parameters by calling + * {@link #quoteEncodedOctets(String) quoteEncodedOctets}. + * This can be useful when migrating code that used to work with + * {@link java.net.URI java.net.URI}. + */ + public static final int QUOTE_ENCODED_CAPABILITY = 1; + + private String scheme, host, userinfo, path, opaque, query, fragment, authority; + private int port = -1; + private final int capabilities; + + // Creates a builder with the given set of capabilities. + Builder(int capabilities) { + this.capabilities = capabilities; + } + + // Creates a builder from the given IRI, with the given set of + // capabilities. + Builder(IRI iri, int capabilities) { + assert iri != null; + this.capabilities = capabilities; + // encoding in multi-args factories is idempotent, + // so we can use the raw form of the component to + // ensure that the string form of the IRI is + // preserved - that is, we want to have: + // iri.with(cap).build().toString().equals(iri.toString()) + scheme = iri.scheme; + query = iri.query; + fragment = iri.fragment; + if (iri.isOpaque()) { + this.opaque = iri.path; + } else { + host = iri.host; + userinfo = iri.userInfo; + path = iri.path; + port = iri.port; + String auth = iri.authority; + if (auth != null && auth.endsWith(":")) { + // this is a bit of a hack to preserve + // non-canonical authority forms + authority = iri.authority; + } + } + } + + /** + * Sets the IRI scheme component. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param scheme The IRI scheme. May be {@code null}, in which case + * the previously defined scheme, if any, will be erased. + * @return this builder + */ + public Builder scheme(String scheme) { + this.scheme = scheme; + return this; + } + + /** + * Sets the IRI host component. + * + *

If this builder has the {@link #QUOTE_ENCODED_CAPABILITY}, + * and the provided value is not a literal address, then the + * provided {@code host} value will be pre-processed by calling + * {@link #quoteEncodedOctets(String) quoteEncodedOctets(host)} + * before storing it in this builder instance. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param host The IRI host. May be {@code null}, in which case + * the previously defined host, if any, will be erased. + * If non-null, then any previously supplied + * {@link #authority(String) authority} will be + * erased. + * @return this builder + */ + public Builder host(String host) { + this.host = checkEncodeHost(host); + if (host != null) { + this.authority = null; + } + return this; + } + + /** + * Sets the IRI userinfo component. + * + *

If this builder has the {@link #QUOTE_ENCODED_CAPABILITY}, the + * provided {@code userinfo} value will be pre-processed by calling + * {@link #quoteEncodedOctets(String) quoteEncodedOctets(userinfo)} + * before storing it in this builder instance. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param userinfo The IRI userinfo. May be {@code null}, in which case + * the previously defined userinfo, if any, will be erased. + * If non-null, then any previously supplied + * {@link #authority(String) authority} will be + * erased. + * @return this builder + */ + public Builder userinfo(String userinfo) { + this.userinfo = checkEncode(userinfo); + if (userinfo != null) { + authority = null; + } + return this; + } + + /** + * Sets the IRI port component. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param port The IRI port. May be {@code -1}, in which case + * the previously specified port, if any, will be erased. + * If not {@code -1}, then any previously supplied + * {@link #authority(String) authority} will be + * erased. + * @return this builder + */ + public Builder port(int port) { + this.port = port; + if (port != -1) { + authority = null; + } + return this; + } + + /** + * Sets the IRI authority component. + * + *

If this builder has the {@link #QUOTE_ENCODED_CAPABILITY}, the + * provided {@code authority} value will be pre-processed by calling + * {@link #quoteEncodedOctets(String) quoteEncodedOctets(authority)} + * before storing it in this builder instance. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param authority The IRI pre-composed authority. + * May be {@code null}, in which case the previously + * specified {@link #authority(String) authority}, if any, + * will be erased. If non-null, then any previously + * supplied {@link #host(String) host}, {@link + * #userinfo userinfo}, and {@link #port port} will be + * erased. + * @return this builder + */ + public Builder authority(String authority) { + this.authority = checkEncode(authority); + if (authority != null) { + this.host = null; + this.port = -1; + this.userinfo = null; + } + return this; + } + + /** + * Sets the IRI path component. + * + *

If this builder has the {@link #QUOTE_ENCODED_CAPABILITY}, the + * provided {@code path} value will be pre-processed by calling + * {@link #quoteEncodedOctets(String) quoteEncodedOctets(path)} + * before storing it in this builder instance. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param path The IRI path. May be {@code null}, in which case + * the previously defined path, if any, will be erased. + * @return this builder + */ + public Builder path(String path) { + this.path = checkEncode(path); + if (path != null) { + this.opaque = null; + } + return this; + } + + /** + * Sets the IRI path component to an opaque path. + * + *

If this builder has the {@link #QUOTE_ENCODED_CAPABILITY}, the + * provided {@code path} value will be pre-processed by calling + * {@link #quoteEncodedOctets(String) quoteEncodedOctets(path)} + * before storing it in this builder instance. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param opaque The IRI opaque path. May be {@code null}, in which case + * the previously defined opaque path, if any, will be erased. + * If non-null, then any previously + * supplied {@link #path(String) path}, {@link + * #host(String) host}, {@link + * #userinfo userinfo}, {@link #port port} and + * {@link #authority(String) authority} will be erased. + * @return this builder + */ + public Builder opaque(String opaque) { + this.opaque = checkEncode(opaque); + if (opaque != null) { + this.path = this.authority = this.host = this.userinfo = null; + this.port = -1; + } + return this; + } + + /** + * Sets the IRI query component. + * + *

If this builder has the {@link #QUOTE_ENCODED_CAPABILITY}, the + * provided {@code query} value will be pre-processed by calling + * {@link #quoteEncodedOctets(String) quoteEncodedOctets(query)} + * before storing it in this builder instance. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param query The IRI query. May be {@code null}, in which case + * any previously defined query will be erased. + * @return this builder + */ + public Builder query(String query) { + this.query = checkEncode(query); + return this; + } + + /** + * Sets the IRI fragment component. + * + *

If this builder has the {@link #QUOTE_ENCODED_CAPABILITY}, the + * provided {@code fragment} value will be pre-processed by calling + * {@link #quoteEncodedOctets(String) quoteEncodedOctets(fragment)} + * before storing it in this builder instance. + * + *

No validation is performed by this method, but supplying an + * invalid value may lead to an {@code URISyntaxException} when + * {@link #build()} is called. + * + * @param fragment The IRI fragment. May be {@code null}, in which case + * the previously defined fragment, if any, will be erased. + * @return this builder + */ + public Builder fragment(String fragment) { + this.fragment = checkEncode(fragment); + return this; + } + + /** + * Builds an IRI from the components stored in this builder. + * + *

The IRI components will be encoded as specified for the + * {@link #createHierarchical(String, String, String, String, String) + * IRI.createHierarchical(scheme, authority, path, query, fragment)} method, + * if a composed {@linkplain #authority(String) authority} component was + * explicitly supplied, or by the + * {@link #createOpaque(String, String, String, String) + * IRI.createOpaque(scheme, opaque, query, fragment)} method, if an {@linkplain + * #opaque(String) opaque path} component was explicitly supplied, or by the + * {@link #createHierarchical(String, String, String, int, String, String, String) + * IRI.createHierarchical(scheme, userinfo, host, port, path, query, fragment)} + * method, otherwise. + * + * @return A new IRI built from the specified components. + * @throws URISyntaxException If the resulting IRI is invalid and + * could not be constructed. + * @throws IllegalArgumentException If the provided components are not + * consistent. For instance, + * if an opaque path was provided with no scheme; + * or if both a scheme and a path are given but the path is relative; + * or if a userinfo or port was provided with no host; + * or if no authority or host is provided and the {@code path} + * component starts with {@code "//"} or contains {@code ':'} before + * the first {@code '/'}; + * or if a host or authority are provided without a scheme, but with a + * path which is not empty and does not start with {@code '/'}. 
+ * + * @see #buildUnchecked() + */ + public IRI build() throws URISyntaxException { + if (opaque != null) { + return IRI.createOpaque(scheme, opaque, query, fragment); + } else if (authority == null) { + return IRI.createHierarchical(scheme, userinfo, host, port, path, query, fragment); + } else { + // host and userinfo may be non-null if this builder was seeded with + // an IRI whose authority component ended with ':' + assert authority.endsWith(":") || host == null && userinfo == null; + assert port == -1; + return IRI.createHierarchical(scheme, authority, path, query, fragment); + } + } + + /** + * Builds an IRI from the components stored in this builder. + * + *

This convenience factory method works as if by invoking the + * {@link #build() build} method; any {@link URISyntaxException} thrown + * by {@code build} is caught and wrapped in a new + * {@link IllegalArgumentException} instance, which is then thrown. + * + * @apiNote + *

This method is provided for use in situations where it is known + * that the given components form a legal IRI, for example with constant + * IRI literals declared within a program, and so it would be considered + * a programming error for the IRI not to parse as such. The + * {@code build} method, which throws {@link URISyntaxException} + * directly, should be used in situations where an IRI is being created + * from user input or from some other source that may be prone to + * errors. + * + * @return A new IRI built from the specified components. + * @throws IllegalArgumentException If the resulting IRI is invalid and + * could not be constructed. + */ + public IRI buildUnchecked() { + try { + return build(); + } catch (URISyntaxException x) { + throw new IllegalArgumentException(x.getMessage(), x); + } + } + + private String checkEncode(String param) { + if (param == null) return null; + return (capabilities & QUOTE_ENCODED_CAPABILITY) == 0 + ? param : quoteEncodedOctets(param); + } + + private String checkEncodeHost(String host) { + if (host == null) return null; + if ((capabilities & QUOTE_ENCODED_CAPABILITY) == 0) + return host; + HostType type = getHostType(host, false); + return type.isLiteral() ? host : quoteEncodedOctets(host); + } + + } + + /** + * Returns a {@link java.net.URI URI} equivalent to this IRI. + * + * @apiNote + * + * This method is provided to ease migration and + * retain compatibility with those APIs that only + * accept {@link java.net.URI java.net.URI} instances. + * + *

Because {@code java.net.URI} supports an older version + * of the Uniform Resource Identifier: Generic Syntax + * RFC, then not all IRIs might be converted into URIs. + * For instance, IRIs of the form {@code "about:"} are not + * parsable by {@code java.net.URI}. In that case, this + * method will throw a {@code URISyntaxException}.

+ * + *

This implementation will additionally remove the superfluous + * colon at the end of the authority component, if the raw authority + * component ends with {@code ':'}. + * Thus, an IRI whose {@linkplain #toString() string form} is + * {@code "file://:/path"} (empty host, empty port), will be converted + * into a {@code java.net.URI} whose {@linkplain URI#toString() + * string form} is {@code "file:///path"}.

+ * + * @return A {@code java.net.URI} instance equivalent to this IRI. + * + * @throws URISyntaxException if this IRI cannot be converted into a + * {@code java.net.URI} instance. + */ + public URI toURI() throws URISyntaxException { + String str; + if (authority != null && authority.endsWith(":")) { + str = buildString(new StringBuilder(), + scheme, + userInfo, host, port, + path, query, fragment).toString(); + } else { + str = defineString(); + } + return new URI(str); + } + +// static { +// SharedSecrets.setJavaNetUriAccess( +// new JavaNetUriAccess() { +// public IRI create(String scheme, String path) { +// return new IRI(scheme, path); +// } +// } +// ); +// } +} diff --git a/net-resource/src/main/java/org/xbib/net/resource/ResourceIdentifier.java b/net-resource/src/main/java/org/xbib/net/resource/ResourceIdentifier.java new file mode 100644 index 0000000..9a3ceb2 --- /dev/null +++ b/net-resource/src/main/java/org/xbib/net/resource/ResourceIdentifier.java @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package org.xbib.net.resource; + +/** + * Represents an abstract Resource Identifier reference. + * + *

The particular conformance with any standards, parsing and construction + * behaviour, and operational aspects of the Resource Identifier are specified + * by a concrete subtype. This class does not have a publicly accessible + * constructor, therefore the only valid subtypes are those that are defined + * in this package. The only known subtypes are {@link URI} (which conforms + * to the obsoleted RFC 2396) and {@link IRI} (which conforms to the + * more recent RFC 3986 / STD 66 and RFC 3987). + * + *

The components of the Resource Identifier are retrievable through the + * methods of this class. Both the raw and decoded forms of the components are + * retrievable. The raw form of a component is the value of the component + * without any interpretation or conversion. The decoded form of a component + * is the value of the raw form after a single decoding pass that decodes UTF-8 + * percent-encoded triplets. A concrete subtype will further define the specific + * set of allowable characters within each component. + * + */ +public abstract class ResourceIdentifier { + + /* package-private */ ResourceIdentifier() { } + + /** + * Tells whether or not this resource identifier is absolute. + * + *

A resource identifier is absolute if, and only if, it has a + * scheme component. + * + * @return {@code true} if, and only if, this resource identifier + * is absolute + */ + public abstract boolean isAbsolute(); + + /** + * Tells whether or not this resource identifier is considered opaque. + * + * @return {@code true} if, and only if, this resource identifier + * is considered opaque + */ + public abstract boolean isOpaque(); + + /** + * Returns the scheme component of this resource identifier. + * + * @return The scheme component of this resource identifier, + * or {@code null} if the scheme is undefined + */ + public abstract String getScheme(); + + /** + * Returns the raw authority component of this resource identifier. + * + * @return The raw authority component of this resource identifier, + * or {@code null} if the authority is undefined + */ + public abstract String getRawAuthority(); + + /** + * Returns the decoded authority component of this resource identifier. + * + * @return The decoded authority component of this resource identifier, + * or {@code null} if the authority is undefined + */ + public abstract String getAuthority(); + + /** + * Returns the raw user-information component of this resource identifier. + * + * @return The raw user-information component of this resource identifier, + * or {@code null} if the user information is undefined + */ + public abstract String getRawUserInfo(); + + /** + * Returns the decoded user-information component of this resource identifier. + * + * @return The decoded user-information component of this resource identifier, + * or {@code null} if the user information is undefined + */ + public abstract String getUserInfo(); + + /** + * Returns the host component of this resource identifier. + * + * @return The host component of this resource identifier, + * or {@code null} if the host is undefined, or does + * not parse as a syntactically valid internet name. 
+ */ + public abstract String getHost(); + + /** + * Returns the port number of this resource identifier. + * + * @return The port component of this resource identifier, + * or {@code -1} if the port is undefined + */ + public abstract int getPort(); + + /** + * Returns the raw path component of this resource identifier. + * + * @apiNote + * + * Different subclasses may return {@code null} on different + * conditions. For instance {@code java.net.URI} will always + * return {@code null} if the URI is opaque, while {@link + * IRI} will simply return the opaque path. + * + * @return The path component of this resource identifier, + * or {@code null} if the path is undefined + */ + public abstract String getRawPath(); + + /** + * Returns the decoded path component of this resource identifier. + * + * @apiNote + * + * Different subclasses may return {@code null} on different + * conditions. For instance {@code java.net.URI} will always + * return {@code null} if the URI is opaque, while {@link + * IRI} will simply return the decoded opaque path. + * + * @return The decoded path component of this resource identifier, + * or {@code null} if the path is undefined + */ + public abstract String getPath(); + + /** + * Returns the raw query component of this resource identifier. + * + * @return The raw query component of this resource identifier, + * or {@code null} if the query is undefined + */ + public abstract String getRawQuery(); + + /** + * Returns the decoded query component of this resource identifier. + * + * @apiNote + * + * Different subclasses may return {@code null} on different + * conditions. For instance {@code java.net.URI} will always + * return {@code null} if the URI is opaque. + * + * @return The decoded query component of this resource identifier, + * or {@code null} if the query is undefined + */ + public abstract String getQuery(); + + /** + * Returns the raw fragment component of this resource identifier. 
+ * + * @return The raw fragment component of this resource identifier, + * or {@code null} if the fragment is undefined + */ + public abstract String getRawFragment(); + + /** + * Returns the decoded fragment component of this resource identifier. + * + * @return The decoded fragment component of this resource identifier, + * or {@code null} if the fragment is undefined + */ + public abstract String getFragment(); + + /** + * Returns the content of this resource identifier as a US-ASCII string. + * + * @return The string form of this resource identifier, encoded as needed + * so that it only contains characters in the US-ASCII charset + */ + public abstract String toASCIIString(); + +} diff --git a/net-resource/src/main/java/org/xbib/net/resource/ThreadLocalCoders.java b/net-resource/src/main/java/org/xbib/net/resource/ThreadLocalCoders.java new file mode 100644 index 0000000..dacfccd --- /dev/null +++ b/net-resource/src/main/java/org/xbib/net/resource/ThreadLocalCoders.java @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + + +package org.xbib.net.resource; + +import java.nio.charset.*; + + +/** + * Utility class for caching per-thread decoders and encoders. + */ + +public class ThreadLocalCoders { + + private static final int CACHE_SIZE = 3; + + private static abstract class Cache { + + // Thread-local reference to array of cached objects, in LRU order + private final ThreadLocal cache = new ThreadLocal<>(); + private final int size; + + Cache(int size) { + this.size = size; + } + + abstract Object create(Object name); + + private void moveToFront(Object[] oa, int i) { + Object ob = oa[i]; + for (int j = i; j > 0; j--) + oa[j] = oa[j - 1]; + oa[0] = ob; + } + + abstract boolean hasName(Object ob, Object name); + + Object forName(Object name) { + Object[] oa = cache.get(); + if (oa == null) { + oa = new Object[size]; + cache.set(oa); + } else { + for (int i = 0; i < oa.length; i++) { + Object ob = oa[i]; + if (ob == null) + continue; + if (hasName(ob, name)) { + if (i > 0) + moveToFront(oa, i); + return ob; + } + } + } + + // Create a new object + Object ob = create(name); + oa[oa.length - 1] = ob; + moveToFront(oa, oa.length - 1); + return ob; + } + + } + + private static Cache decoderCache = new Cache(CACHE_SIZE) { + boolean hasName(Object ob, Object name) { + if (name instanceof String) + return (((CharsetDecoder)ob).charset().name().equals(name)); + if (name instanceof Charset) + return ((CharsetDecoder)ob).charset().equals(name); + return false; + } + Object create(Object name) { + if (name instanceof String) + return Charset.forName((String)name).newDecoder(); + if (name instanceof 
Charset) + return ((Charset)name).newDecoder(); + assert false; + return null; + } + }; + + public static CharsetDecoder decoderFor(Object name) { + CharsetDecoder cd = (CharsetDecoder)decoderCache.forName(name); + cd.reset(); + return cd; + } + + private static Cache encoderCache = new Cache(CACHE_SIZE) { + boolean hasName(Object ob, Object name) { + if (name instanceof String) + return (((CharsetEncoder)ob).charset().name().equals(name)); + if (name instanceof Charset) + return ((CharsetEncoder)ob).charset().equals(name); + return false; + } + Object create(Object name) { + if (name instanceof String) + return Charset.forName((String)name).newEncoder(); + if (name instanceof Charset) + return ((Charset)name).newEncoder(); + assert false; + return null; + } + }; + + public static CharsetEncoder encoderFor(Object name) { + CharsetEncoder ce = (CharsetEncoder)encoderCache.forName(name); + ce.reset(); + return ce; + } + +} diff --git a/net-resource/src/main/java/org/xbib/net/resource/URI.java b/net-resource/src/main/java/org/xbib/net/resource/URI.java new file mode 100644 index 0000000..e12e71a --- /dev/null +++ b/net-resource/src/main/java/org/xbib/net/resource/URI.java @@ -0,0 +1,3726 @@ +/* + * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package org.xbib.net.resource; + +import java.io.File; +import java.io.IOException; +import java.io.InvalidObjectException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.CharacterCodingException; +import java.nio.file.Path; +import java.text.Normalizer; + +import java.lang.Character; // for javadoc +import java.lang.NullPointerException; // for javadoc + + +/** + * Represents a Uniform Resource Identifier (URI) reference as defined + * by the obsolete RFC 2396. + * + *

That specification has been superseded by RFCs 3986 and 3987, which define + * URIs and IRIs. + * Both of these entities are now represented by the one class {@link IRI}, + * which is preferred over this one. + * + *

Aside from some minor deviations noted below, an instance of this + * class represents a URI reference as defined by + * RFC 2396: Uniform + * Resource Identifiers (URI): Generic Syntax, amended by RFC 2732: Format for + * Literal IPv6 Addresses in URLs. The Literal IPv6 address format + * also supports scope_ids. The syntax and usage of scope_ids is described + * here. + * This class provides constructors for creating URI instances from + * their components or by parsing their string forms, methods for accessing the + * various components of an instance, and methods for normalizing, resolving, + * and relativizing URI instances. Instances of this class are immutable. + * + * + *

URI syntax and components

+ * + * At the highest level a URI reference (hereinafter simply "URI") in string + * form has the syntax + * + *
+ * [scheme{@code :}]scheme-specific-part[{@code #}fragment] + *
+ * + * where square brackets [...] delineate optional components and the characters + * {@code :} and {@code #} stand for themselves. + * + *

An absolute URI specifies a scheme; a URI that is not absolute is + * said to be relative. URIs are also classified according to whether + * they are opaque or hierarchical. + * + *

An opaque URI is an absolute URI whose scheme-specific part does + * not begin with a slash character ({@code '/'}). Opaque URIs are not + * subject to further parsing. Some examples of opaque URIs are: + * + *

    + *
  • {@code mailto:java-net@www.example.com}
  • + *
  • {@code news:comp.lang.java}
  • + *
  • {@code urn:isbn:096139210x}
  • + *
+ * + *

A hierarchical URI is either an absolute URI whose + * scheme-specific part begins with a slash character, or a relative URI, that + * is, a URI that does not specify a scheme. Some examples of hierarchical + * URIs are: + * + *

+ * {@code http://example.com/languages/java/}
+ * {@code sample/a/index.html#28}
+ * {@code ../../demo/b/index.html}
+ * {@code file:///~/calendar} + *
+ * + *

A hierarchical URI is subject to further parsing according to the syntax + * + *

+ * [scheme{@code :}][{@code //}authority][path][{@code ?}query][{@code #}fragment] + *
+ * + * where the characters {@code :}, {@code /}, + * {@code ?}, and {@code #} stand for themselves. The + * scheme-specific part of a hierarchical URI consists of the characters + * between the scheme and fragment components. + * + *

The authority component of a hierarchical URI is, if specified, either + * server-based or registry-based. A server-based authority + * parses according to the familiar syntax + * + *

+ * [user-info{@code @}]host[{@code :}port] + *
+ * + * where the characters {@code @} and {@code :} stand for + * themselves. Nearly all URI schemes currently in use are server-based. An + * authority component that does not parse in this way is considered to be + * registry-based. + * + *

The path component of a hierarchical URI is itself said to be absolute + * if it begins with a slash character ({@code '/'}); otherwise it is + * relative. The path of a hierarchical URI that is either absolute or + * specifies an authority is always absolute. + * + *

All told, then, a URI instance has the following nine components: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment
ComponentType
scheme{@code String}
scheme-specific-part{@code String}
authority{@code String}
user-info{@code String}
host{@code String}
port{@code int}
path{@code String}
query{@code String}
fragment{@code String}
+ * + * In a given instance any particular component is either undefined or + * defined with a distinct value. Undefined string components are + * represented by {@code null}, while undefined integer components are + * represented by {@code -1}. A string component may be defined to have the + * empty string as its value; this is not equivalent to that component being + * undefined. + * + *

Whether a particular component is or is not defined in an instance + * depends upon the type of the URI being represented. An absolute URI has a + * scheme component. An opaque URI has a scheme, a scheme-specific part, and + * possibly a fragment, but has no other components. A hierarchical URI always + * has a path (though it may be empty) and a scheme-specific-part (which at + * least contains the path), and may have any of the other components. If the + * authority component is present and is server-based then the host component + * will be defined and the user-information and port components may be defined. + * + * + *

Operations on URI instances

+ * + * The key operations supported by this class are those of + * normalization, resolution, and relativization. + * + *

Normalization is the process of removing unnecessary {@code "."} + * and {@code ".."} segments from the path component of a hierarchical URI. + * Each {@code "."} segment is simply removed. A {@code ".."} segment is + * removed only if it is preceded by a non-{@code ".."} segment. + * Normalization has no effect upon opaque URIs. + * + *

Resolution is the process of resolving one URI against another, + * base URI. The resulting URI is constructed from components of both + * URIs in the manner specified by RFC 2396, taking components from the + * base URI for those not specified in the original. For hierarchical URIs, + * the path of the original is resolved against the path of the base and then + * normalized. The result, for example, of resolving + * + *

+ * {@code sample/a/index.html#28} + *              + *     (1) + *
+ * + * against the base URI {@code http://example.com/languages/java/} is the result + * URI + * + *
+ * {@code http://example.com/languages/java/sample/a/index.html#28} + *
+ * + * Resolving the relative URI + * + *
+ * {@code ../../demo/b/index.html}    (2) + *
+ * + * against this result yields, in turn, + * + *
+ * {@code http://example.com/languages/java/demo/b/index.html} + *
+ * + * Resolution of both absolute and relative URIs, and of both absolute and + * relative paths in the case of hierarchical URIs, is supported. Resolving + * the URI {@code file:///~calendar} against any other URI simply yields the + * original URI, since it is absolute. Resolving the relative URI (2) above + * against the relative base URI (1) yields the normalized, but still relative, + * URI + * + *
+ * {@code demo/b/index.html} + *
+ * + *

Relativization, finally, is the inverse of resolution: For any + * two normalized URIs u and v, + * + *

+ * u{@code .relativize(}u{@code .resolve(}v{@code )).equals(}v{@code )}  and
+ * u{@code .resolve(}u{@code .relativize(}v{@code )).equals(}v{@code )}  .
+ *
+ * + * This operation is often useful when constructing a document containing URIs + * that must be made relative to the base URI of the document wherever + * possible. For example, relativizing the URI + * + *
+ * {@code http://example.com/languages/java/sample/a/index.html#28} + *
+ * + * against the base URI + * + *
+ * {@code http://example.com/languages/java/} + *
+ * + * yields the relative URI {@code sample/a/index.html#28}. + * + * + *

Character categories

+ * + * RFC 2396 specifies precisely which characters are permitted in the + * various components of a URI reference. The following categories, most of + * which are taken from that specification, are used below to describe these + * constraints: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other
CategoryDescription
alphaThe US-ASCII alphabetic characters, + * {@code 'A'} through {@code 'Z'} + * and {@code 'a'} through {@code 'z'}
digitThe US-ASCII decimal digit characters, + * {@code '0'} through {@code '9'}
alphanumAll alpha and digit characters
unreservedAll alphanum characters together with those in the string + * {@code "_-!.~'()*"}
punctThe characters in the string {@code ",;:$&+="}
reservedAll punct characters together with those in the string + * {@code "?/[]@"}
escapedEscaped octets, that is, triplets consisting of the percent + * character ({@code '%'}) followed by two hexadecimal digits + * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and + * {@code 'a'}-{@code 'f'})
otherThe Unicode characters that are not in the US-ASCII character set, + * are not control characters (according to the {@link + * java.lang.Character#isISOControl(char) Character.isISOControl} + * method), and are not space characters (according to the {@link + * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} + * method)  (Deviation from RFC 2396, which is + * limited to US-ASCII)
+ * + *

The set of all legal URI characters consists of + * the unreserved, reserved, escaped, and other + * characters. + * + * + *

Escaped octets, quotation, encoding, and decoding

+ * + * RFC 2396 allows escaped octets to appear in the user-info, path, query, and + * fragment components. Escaping serves two purposes in URIs: + * + *
    + * + *
  • To encode non-US-ASCII characters when a URI is required to + * conform strictly to RFC 2396 by not containing any other + * characters.

  • + * + *
  • To quote characters that are otherwise illegal in a + * component. The user-info, path, query, and fragment components differ + * slightly in terms of which characters are considered legal and illegal. + *

  • + * + *
+ * + * These purposes are served in this class by three related operations: + * + *
    + * + *
  • A character is encoded by replacing it + * with the sequence of escaped octets that represent that character in the + * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), + * for example, is encoded as {@code "%E2%82%AC"}. (Deviation from + * RFC 2396, which does not specify any particular character + * set.)

  • + * + *
  • An illegal character is quoted simply by + * encoding it. The space character, for example, is quoted by replacing it + * with {@code "%20"}. UTF-8 contains US-ASCII, hence for US-ASCII + * characters this transformation has exactly the effect required by + * RFC 2396.

  • + * + *
  • + * A sequence of escaped octets is decoded by + * replacing it with the sequence of characters that it represents in the + * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the + * effect of de-quoting any quoted US-ASCII characters as well as that of + * decoding any encoded non-US-ASCII characters. If a decoding error occurs + * when decoding the escaped octets then the erroneous octets are replaced by + * {@code '\u005CuFFFD'}, the Unicode replacement character.

  • + * + *
+ * + * These operations are exposed in the constructors and methods of this class + * as follows: + * + *
    + * + *
  • The {@linkplain #URI(java.lang.String) single-argument + * constructor} requires any illegal characters in its argument to be + * quoted and preserves any escaped octets and other characters that + * are present.

  • + * + *
  • The {@linkplain + * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) + * multi-argument constructors} quote illegal characters as + * required by the components in which they appear. The percent character + * ({@code '%'}) is always quoted by these constructors. Any other + * characters are preserved.

  • + * + *
  • The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() + * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() + * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link + * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the + * values of their corresponding components in raw form, without interpreting + * any escaped octets. The strings returned by these methods may contain + * both escaped octets and other characters, and will not contain any + * illegal characters.

  • + * + *
  • The {@link #getUserInfo() getUserInfo}, {@link #getPath() + * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() + * getFragment}, {@link #getAuthority() getAuthority}, and {@link + * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped + * octets in their corresponding components. The strings returned by these + * methods may contain both other characters and illegal characters, + * and will not contain any escaped octets.

  • + * + *
  • The {@link #toString() toString} method returns a URI string with + * all necessary quotation but which may contain other characters. + *

  • + * + *
  • The {@link #toASCIIString() toASCIIString} method returns a fully + * quoted and encoded URI string that does not contain any other + * characters.

  • + * + *
+ * + * + *

Identities

+ * + * For any URI u, it is always the case that + * + *
+ * {@code new URI(}u{@code .toString()).equals(}u{@code )} . + *
+ * + * For any URI u that does not contain redundant syntax such as two + * slashes before an empty authority (as in {@code file:///tmp/} ) or a + * colon following a host name but no port (as in + * {@code http://www.example.com:} ), and that does not encode characters + * except those that must be quoted, the following identities also hold: + *
+ *     new URI(u.getScheme(),
+ *             u.getSchemeSpecificPart(),
+ *             u.getFragment())
+ *     .equals(u)
+ * in all cases, + *
+ *     new URI(u.getScheme(),
+ *             u.getAuthority(),
+ *             u.getPath(), u.getQuery(),
+ *             u.getFragment())
+ *     .equals(u)
+ * if u is hierarchical, and + *
+ *     new URI(u.getScheme(),
+ *             u.getUserInfo(), u.getHost(), u.getPort(),
+ *             u.getPath(), u.getQuery(),
+ *             u.getFragment())
+ *     .equals(u)
+ * if u is hierarchical and has either no authority or a server-based + * authority. + * + * + *

URIs, URLs, and URNs

+ * + * A URI is a uniform resource identifier while a URL is a uniform + * resource locator. Hence every URL is a URI, abstractly speaking, but + * not every URI is a URL. This is because there is another subcategory of + * URIs, uniform resource names (URNs), which name resources but do not + * specify how to locate them. The {@code mailto}, {@code news}, and + * {@code isbn} URIs shown above are examples of URNs. + * + *

The conceptual distinction between URIs and URLs is reflected in the + * differences between this class and the {@link URL} class. + * + *

An instance of this class represents a URI reference in the syntactic + * sense defined by RFC 2396. A URI may be either absolute or relative. + * A URI string is parsed according to the generic syntax without regard to the + * scheme, if any, that it specifies. No lookup of the host, if any, is + * performed, and no scheme-dependent stream handler is constructed. Equality, + * hashing, and comparison are defined strictly in terms of the character + * content of the instance. In other words, a URI instance is little more than + * a structured string that supports the syntactic, scheme-independent + * operations of comparison, normalization, resolution, and relativization. + * + *

An instance of the {@link URL} class, by contrast, represents the + * syntactic components of a URL together with some of the information required + * to access the resource that it describes. A URL must be absolute, that is, + * it must always specify a scheme. A URL string is parsed according to its + * scheme. A stream handler is always established for a URL, and in fact it is + * impossible to create a URL instance for a scheme for which no handler is + * available. Equality and hashing depend upon both the scheme and the + * Internet address of the host, if any; comparison is not defined. In other + * words, a URL is a structured string that supports the syntactic operation of + * resolution as well as the network I/O operations of looking up the host and + * opening a connection to the specified resource. + * + * @apiNote + * + * Applications working with file paths and file URIs should take great + * care to use the appropriate methods to convert between the two. + * The {@link Path#of(URI)} factory method and the {@link File#File(URI)} + * constructor can be used to create {@link Path} or {@link File} + * objects from a file URI. {@link Path#toUri()} and {@link File#toURI()} + * can be used to create a {@link URI} from a file path. + * Applications should never try to {@linkplain + * #URI(String, String, String, int, String, String, String) + * construct}, {@linkplain #URI(String) parse}, or + * {@linkplain #resolve(String) resolve} a {@code URI} + * from the direct string representation of a {@code File} or {@code Path} + * instance. + *

+ * Some components of a URL or URI, such as userinfo, may + * be abused to construct misleading URLs or URIs. Applications + * that deal with URLs or URIs should take into account + * the recommendations advised in RFC3986, + * Section 7, Security Considerations. + * + * @author Mark Reinhold + * @since 1.4 + * + * @see RFC 2279: UTF-8, a + * transformation format of ISO 10646,
RFC 2373: IPv6 Addressing + * Architecture,
RFC 2396: Uniform + * Resource Identifiers (URI): Generic Syntax,
RFC 2732: Format for + * Literal IPv6 Addresses in URLs,
URISyntaxException + */ + +public final class URI extends ResourceIdentifier + implements Comparable, Serializable +{ + + // Note: Comments containing the word "ASSERT" indicate places where a + // throw of an InternalError should be replaced by an appropriate assertion + // statement once asserts are enabled in the build. + + static final long serialVersionUID = -6052424284110960213L; + + + // -- Properties and components of this instance -- + + // Components of all URIs: [scheme:]scheme-specific-part[#fragment] + private transient String scheme; // null ==> relative URI + private transient String fragment; + + // Hierarchical URI components: [//authority]path[?query] + private transient String authority; // Registry or server + + // Server-based authority: [user-info@]host[:port] + private transient String userInfo; + private transient String host; // null ==> registry-based + private transient int port = -1; // -1 ==> undefined + + // Remaining components of hierarchical URIs + private transient String path; // null ==> opaque + private transient String query; + + // The remaining fields may be computed on demand, which is safe even in + // the face of multiple threads racing to initialize them + private transient String schemeSpecificPart; + private transient int hash; // Zero ==> undefined + + private transient String decodedUserInfo; + private transient String decodedAuthority; + private transient String decodedPath; + private transient String decodedQuery; + private transient String decodedFragment; + private transient String decodedSchemeSpecificPart; + + /** + * The string form of this URI. + * + * @serial + */ + private volatile String string; // The only serializable field + + + + // -- Constructors and factories -- + + private URI() { } // Used internally + + /** + * Constructs a URI by parsing the given string. + * + *

This constructor parses the given string exactly as specified by the + * grammar in RFC 2396, + * Appendix A, except for the following deviations:

+ * + *
    + * + *
  • An empty authority component is permitted as long as it is + * followed by a non-empty path, a query component, or a fragment + * component. This allows the parsing of URIs such as + * {@code "file:///foo/bar"}, which seems to be the intent of + * RFC 2396 although the grammar does not permit it. If the + * authority component is empty then the user-information, host, and port + * components are undefined.

  • + * + *
  • Empty relative paths are permitted; this seems to be the + * intent of RFC 2396 although the grammar does not permit it. The + * primary consequence of this deviation is that a standalone fragment + * such as {@code "#foo"} parses as a relative URI with an empty path + * and the given fragment, and can be usefully resolved against a base URI. + * + *

  • IPv4 addresses in host components are parsed rigorously, as + * specified by RFC 2732: Each + * element of a dotted-quad address must contain no more than three + * decimal digits. Each element is further constrained to have a value + * no greater than 255.

  • + * + *
  • Hostnames in host components that comprise only a single + * domain label are permitted to start with an alphanum + * character. This seems to be the intent of RFC 2396 + * section 3.2.2 although the grammar does not permit it. The + * consequence of this deviation is that the authority component of a + * hierarchical URI such as {@code s://123}, will parse as a server-based + * authority.

  • + * + *
  • IPv6 addresses are permitted for the host component. An IPv6 + * address must be enclosed in square brackets ({@code '['} and + * {@code ']'}) as specified by RFC 2732. The + * IPv6 address itself must parse according to RFC 2373. IPv6 + * addresses are further constrained to describe no more than sixteen + * bytes of address information, a constraint implicit in RFC 2373 + * but not expressible in the grammar.

  • + * + *
  • Characters in the other category are permitted wherever + * RFC 2396 permits escaped octets, that is, in the + * user-information, path, query, and fragment components, as well as in + * the authority component if the authority is registry-based. This + * allows URIs to contain Unicode characters beyond those in the US-ASCII + * character set.

  • + * + *
+ * + * @param str The string to be parsed into a URI + * + * @throws NullPointerException + * If {@code str} is {@code null} + * + * @throws URISyntaxException + * If the given string violates RFC 2396, as augmented + * by the above deviations + */ + public URI(String str) throws URISyntaxException { + new Parser(str).parse(false); /* parsing is delegated to Parser; the five- and three-argument constructors pass false as well, only the seven-argument one passes true */ + } + + /** + * Constructs a hierarchical URI from the given components. + * + *

If a scheme is given then the path, if also given, must either be + * empty or begin with a slash character ({@code '/'}). Otherwise a + * component of the new URI may be left undefined by passing {@code null} + * for the corresponding parameter or, in the case of the {@code port} + * parameter, by passing {@code -1}. + * + *

This constructor first builds a URI string from the given components + * according to the rules specified in RFC 2396, + * section 5.2, step 7:

+ * + *
    + * + *
  1. Initially, the result string is empty.

  2. + * + *
  3. If a scheme is given then it is appended to the result, + * followed by a colon character ({@code ':'}).

  4. + * + *
  5. If user information, a host, or a port are given then the + * string {@code "//"} is appended.

  6. + * + *
  7. If user information is given then it is appended, followed by + * a commercial-at character ({@code '@'}). Any character not in the + * unreserved, punct, escaped, or other + * categories is quoted.

  8. + * + *
  9. If a host is given then it is appended. If the host is a + * literal IPv6 address but is not enclosed in square brackets + * ({@code '['} and {@code ']'}) then the square brackets are added. + *

  10. + * + *
  11. If a port number is given then a colon character + * ({@code ':'}) is appended, followed by the port number in decimal. + *

  12. + * + *
  13. If a path is given then it is appended. Any character not in + * the unreserved, punct, escaped, or other + * categories, and not equal to the slash character ({@code '/'}) or the + * commercial-at character ({@code '@'}), is quoted.

  14. + * + *
  15. If a query is given then a question-mark character + * ({@code '?'}) is appended, followed by the query. Any character that + * is not a legal URI character is quoted. + *

  16. + * + *
  17. Finally, if a fragment is given then a hash character + * ({@code '#'}) is appended, followed by the fragment. Any character + * that is not a legal URI character is quoted.

  18. + * + *
+ * + *

The resulting URI string is then parsed as if by invoking the {@link + * #URI(String)} constructor and then invoking the {@link + * #parseServerAuthority()} method upon the result; this may cause a {@link + * URISyntaxException} to be thrown.

+ * + * @param scheme Scheme name + * @param userInfo User name and authorization information + * @param host Host name + * @param port Port number + * @param path Path + * @param query Query + * @param fragment Fragment + * + * @throws URISyntaxException + * If both a scheme and a path are given but the path is relative, + * if the URI string constructed from the given components violates + * RFC 2396, or if the authority component of the string is + * present but cannot be parsed as a server-based authority + */ + public URI(String scheme, + String userInfo, String host, int port, + String path, String query, String fragment) + throws URISyntaxException + { + String s = toString(scheme, null, + null, userInfo, host, port, + path, query, fragment); /* second and third arguments are null: no scheme-specific part and no pre-built authority are supplied */ + checkPath(s, scheme, path); + new Parser(s).parse(true); /* the only constructor in this group that passes true; presumably this requests the server-based authority check described in the Javadoc -- confirm against Parser.parse */ + } + + /** + * Constructs a hierarchical URI from the given components. + * + *

If a scheme is given then the path, if also given, must either be + * empty or begin with a slash character ({@code '/'}). Otherwise a + * component of the new URI may be left undefined by passing {@code null} + * for the corresponding parameter. + * + *

This constructor first builds a URI string from the given components + * according to the rules specified in RFC 2396, + * section 5.2, step 7:

+ * + *
    + * + *
  1. Initially, the result string is empty.

  2. + * + *
  3. If a scheme is given then it is appended to the result, + * followed by a colon character ({@code ':'}).

  4. + * + *
  5. If an authority is given then the string {@code "//"} is + * appended, followed by the authority. If the authority contains a + * literal IPv6 address then the address must be enclosed in square + * brackets ({@code '['} and {@code ']'}). Any character not in the + * unreserved, punct, escaped, or other + * categories, and not equal to the commercial-at character + * ({@code '@'}), is quoted.

  6. + * + *
  7. If a path is given then it is appended. Any character not in + * the unreserved, punct, escaped, or other + * categories, and not equal to the slash character ({@code '/'}) or the + * commercial-at character ({@code '@'}), is quoted.

  8. + * + *
  9. If a query is given then a question-mark character + * ({@code '?'}) is appended, followed by the query. Any character that + * is not a legal URI character is quoted. + *

  10. + * + *
  11. Finally, if a fragment is given then a hash character + * ({@code '#'}) is appended, followed by the fragment. Any character + * that is not a legal URI character is quoted.

  12. + * + *
+ * + *

The resulting URI string is then parsed as if by invoking the {@link + * #URI(String)} constructor and then invoking the {@link + * #parseServerAuthority()} method upon the result; this may cause a {@link + * URISyntaxException} to be thrown.

+ * + * @param scheme Scheme name + * @param authority Authority + * @param path Path + * @param query Query + * @param fragment Fragment + * + * @throws URISyntaxException + * If both a scheme and a path are given but the path is relative, + * if the URI string constructed from the given components violates + * RFC 2396, or if the authority component of the string is + * present but cannot be parsed as a server-based authority + */ + public URI(String scheme, + String authority, + String path, String query, String fragment) + throws URISyntaxException + { + String s = toString(scheme, null, + authority, null, null, -1, + path, query, fragment); /* authority is passed pre-built; user-info and host are null and the port is -1 */ + checkPath(s, scheme, path); + new Parser(s).parse(false); /* NOTE(review): the Javadoc says parseServerAuthority() is applied and that a non-server-based authority is rejected, yet this passes false where the seven-argument constructor passes true -- confirm which is intended */ + } + + /** + * Constructs a hierarchical URI from the given components. + * + *

A component may be left undefined by passing {@code null}. + * + *

This convenience constructor works as if by invoking the + * seven-argument constructor as follows: + * + *

+ * {@code new} {@link #URI(String, String, String, int, String, String, String) + * URI}{@code (scheme, null, host, -1, path, null, fragment);} + *
+ * + * @param scheme Scheme name + * @param host Host name + * @param path Path + * @param fragment Fragment + * + * @throws URISyntaxException + * If the URI string constructed from the given components + * violates RFC 2396, or under any other condition documented for the seven-argument constructor, to which this constructor delegates + */ + public URI(String scheme, String host, String path, String fragment) + throws URISyntaxException + { + this(scheme, null, host, -1, path, null, fragment); + } + + /** + * Constructs a URI from the given components. + * + *

A component may be left undefined by passing {@code null}. + * + *

This constructor first builds a URI in string form using the given + * components as follows:

+ * + *
    + * + *
  1. Initially, the result string is empty.

  2. + * + *
  3. If a scheme is given then it is appended to the result, + * followed by a colon character ({@code ':'}).

  4. + * + *
  5. If a scheme-specific part is given then it is appended. Any + * character that is not a legal URI character + * is quoted.

  6. + * + *
  7. Finally, if a fragment is given then a hash character + * ({@code '#'}) is appended to the string, followed by the fragment. + * Any character that is not a legal URI character is quoted.

  8. + * + *
+ * + *

The resulting URI string is then parsed in order to create the new + * URI instance as if by invoking the {@link #URI(String)} constructor; + * this may cause a {@link URISyntaxException} to be thrown.

+ * + * @param scheme Scheme name + * @param ssp Scheme-specific part + * @param fragment Fragment + * + * @throws URISyntaxException + * If the URI string constructed from the given components + * violates RFC 2396 + */ + public URI(String scheme, String ssp, String fragment) + throws URISyntaxException + { + new Parser(toString(scheme, ssp, + null, null, null, -1, + null, null, fragment)) + .parse(false); + } + + /** + * Constructs a simple URI consisting of only a scheme and a pre-validated + * path. Provides a fast-path for some internal cases. + */ + URI(String scheme, String path) { + assert validSchemeAndPath(scheme, path); + this.scheme = scheme; + this.path = path; + } + + private static boolean validSchemeAndPath(String scheme, String path) { + try { + URI u = new URI(scheme + ":" + path); + return scheme.equals(u.scheme) && path.equals(u.path); + } catch (URISyntaxException e) { + return false; + } + } + + /** + * Creates a URI by parsing the given string. + * + *

This convenience factory method works as if by invoking the {@link + * #URI(String)} constructor; any {@link URISyntaxException} thrown by the + * constructor is caught and wrapped in a new {@link + * IllegalArgumentException} object, which is then thrown. + * + *

This method is provided for use in situations where it is known that + * the given string is a legal URI, for example for URI constants declared + * within a program, and so it would be considered a programming error + * for the string not to parse as such. The constructors, which throw + * {@link URISyntaxException} directly, should be used in situations where a + * URI is being constructed from user input or from some other source that + * may be prone to errors.

+ * + * @param str The string to be parsed into a URI + * @return The new URI + * + * @throws NullPointerException + * If {@code str} is {@code null} + * + * @throws IllegalArgumentException + * If the given string violates RFC 2396 + */ + public static URI create(String str) { + try { + return new URI(str); + } catch (URISyntaxException x) { + throw new IllegalArgumentException(x.getMessage(), x); + } + } + + /** + * Converts the given {@code ResourceIdentifier} into a {@code URI}. + * + *

If the given {@link ResourceIdentifier} {@code ri} is a + * {@code URI}, returns {@code ri}. Otherwise, if it is an + * {@code IRI}, obtains a new {@code URI} as if by calling + * {@link IRI#toURI()}. Otherwise, returns a new {@code URI} + * constructed as if by calling {@link #create(String) + * URI.create(ri.toString())}.

+ * + * @apiNote + *

This method is provided for use in situations where an API + * accepts an abstract {@link ResourceIdentifier} as input, on + * the condition that it must be convertible to a concrete instance + * of {@code URI} for further processing.

+ * + * @param ri The {@code ResourceIdentifier} to be converted into + * a {@code URI} + * @return A {@code URI} converted from the given + * {@code ResourceIdentifier}. + * + * @throws NullPointerException + * If {@code ri} is {@code null} + * + * @throws IllegalArgumentException + * If the given {@code ResourceIdentifier} cannot be converted + * to a {@code URI}. + */ + public static URI of(ResourceIdentifier ri) { + try { + return (ri instanceof URI) ? (URI)ri // URI + : (ri instanceof IRI) ? ((IRI)ri).toURI() // IRI + : URI.create(ri.toString()); // throws NPE (ri == null); + } catch (URISyntaxException x) { + throw new IllegalArgumentException(x.getMessage(), x); + } + } + + + // -- Operations -- + + /** + * Attempts to parse this URI's authority component, if defined, into + * user-information, host, and port components. + * + *

If this URI's authority component has already been recognized as + * being server-based then it will already have been parsed into + * user-information, host, and port components. In this case, or if this + * URI has no authority component, this method simply returns this URI. + * + *

Otherwise this method attempts once more to parse the authority + * component into user-information, host, and port components, and throws + * an exception describing why the authority component could not be parsed + * in that way. + * + *

This method is provided because the generic URI syntax specified in + * RFC 2396 + * cannot always distinguish a malformed server-based authority from a + * legitimate registry-based authority. It must therefore treat some + * instances of the former as instances of the latter. The authority + * component in the URI string {@code "//foo:bar"}, for example, is not a + * legal server-based authority but it is legal as a registry-based + * authority. + * + *

In many common situations, for example when working with URIs that are + * known to be either URNs or URLs, the hierarchical URIs being used will + * always be server-based. They therefore must either be parsed as such or + * treated as an error. In these cases a statement such as + * + *

+ * {@code URI }u{@code = new URI(str).parseServerAuthority();} + *
+ * + *

can be used to ensure that u always refers to a URI that, if + * it has an authority component, has a server-based authority with proper + * user-information, host, and port components. Invoking this method also + * ensures that if the authority could not be parsed in that way then an + * appropriate diagnostic message can be issued based upon the exception + * that is thrown.

+ * + * @return A URI whose authority field has been parsed + * as a server-based authority + * + * @throws URISyntaxException + * If the authority component of this URI is defined + * but cannot be parsed as a server-based authority + * according to RFC 2396 + */ + public URI parseServerAuthority() + throws URISyntaxException + { + // We could be clever and cache the error message and index from the + // exception thrown during the original parse, but that would require + // either more fields or a more-obscure representation. + if ((host != null) || (authority == null)) + return this; + new Parser(toString()).parse(true); + return this; + } + + /** + * Normalizes this URI's path. + * + *

If this URI is opaque, or if its path is already in normal form, + * then this URI is returned. Otherwise a new URI is constructed that is + * identical to this URI except that its path is computed by normalizing + * this URI's path in a manner consistent with RFC 2396, + * section 5.2, step 6, sub-steps c through f; that is: + *

+ * + *
    + * + *
  1. All {@code "."} segments are removed.

  2. + * + *
  3. If a {@code ".."} segment is preceded by a non-{@code ".."} + * segment then both of these segments are removed. This step is + * repeated until it is no longer applicable.

  4. + * + *
  5. If the path is relative, and if its first segment contains a + * colon character ({@code ':'}), then a {@code "."} segment is + * prepended. This prevents a relative URI with a path such as + * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a + * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. + * (Deviation from RFC 2396)

  6. + * + *
+ * + *

A normalized path will begin with one or more {@code ".."} segments + * if there were insufficient non-{@code ".."} segments preceding them to + * allow their removal. A normalized path will begin with a {@code "."} + * segment if one was inserted by step 3 above. Otherwise, a normalized + * path will not contain any {@code "."} or {@code ".."} segments.

+ * + * @return A URI equivalent to this URI, + * but whose path is in normal form + */ + public URI normalize() { + return normalize(this); + } + + /** + * Resolves the given URI against this URI. + * + *

If the given URI is already absolute, or if this URI is opaque, then + * the given URI is returned. + * + *

If the given URI's fragment component is + * defined, its path component is empty, and its scheme, authority, and + * query components are undefined, then a URI with the given fragment but + * with all other components equal to those of this URI is returned. This + * allows a URI representing a standalone fragment reference, such as + * {@code "#foo"}, to be usefully resolved against a base URI. + * + *

Otherwise this method constructs a new hierarchical URI in a manner + * consistent with RFC 2396, + * section 5.2; that is:

+ * + *
    + * + *
  1. A new URI is constructed with this URI's scheme and the given + * URI's query and fragment components.

  2. + * + *
  3. If the given URI has an authority component then the new URI's + * authority and path are taken from the given URI.

  4. + * + *
  5. Otherwise the new URI's authority component is copied from + * this URI, and its path is computed as follows:

    + * + *
      + * + *
    1. If the given URI's path is absolute then the new URI's path + * is taken from the given URI.

    2. + * + *
    3. Otherwise the given URI's path is relative, and so the new + * URI's path is computed by resolving the path of the given URI + * against the path of this URI. This is done by concatenating all but + * the last segment of this URI's path, if any, with the given URI's + * path and then normalizing the result as if by invoking the {@link + * #normalize() normalize} method.

    4. + * + *
  6. + * + *
+ * + *

The result of this method is absolute if, and only if, either this + * URI is absolute or the given URI is absolute.

+ * + * @param uri The URI to be resolved against this URI + * @return The resulting URI + * + * @throws NullPointerException + * If {@code uri} is {@code null} + */ + public URI resolve(URI uri) { + return resolve(this, uri); + } + + /** + * Constructs a new URI by parsing the given string and then resolving it + * against this URI. + * + *

This convenience method works as if invoking it were equivalent to + * evaluating the expression {@link #resolve(URI) + * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}.

+ * + * @param str The string to be parsed into a URI + * @return The resulting URI + * + * @throws NullPointerException + * If {@code str} is {@code null} + * + * @throws IllegalArgumentException + * If the given string violates RFC 2396 + */ + public URI resolve(String str) { + return resolve(URI.create(str)); + } + + /** + * Relativizes the given URI against this URI. + * + *

The relativization of the given URI against this URI is computed as + * follows:

+ * + *
    + * + *
  1. If either this URI or the given URI is opaque, or if the + * scheme and authority components of the two URIs are not identical, or + * if the path of this URI is not a prefix of the path of the given URI, + * then the given URI is returned.

  2. + * + *
  3. Otherwise a new relative hierarchical URI is constructed with + * query and fragment components taken from the given URI and with a path + * component computed by removing this URI's path from the beginning of + * the given URI's path.

  4. + * + *
+ * + * @param uri The URI to be relativized against this URI + * @return The resulting URI + * + * @throws NullPointerException + * If {@code uri} is {@code null} + */ + public URI relativize(URI uri) { + return relativize(this, uri); + } + + /** + * Constructs a URL from this URI. + * + *

This convenience method works as if invoking it were equivalent to + * evaluating the expression {@code new URL(this.toString())} after + * first checking that this URI is absolute.

+ * + * @return A URL constructed from this URI + * + * @throws IllegalArgumentException + * If this URL is not absolute + * + * @throws MalformedURLException + * If a protocol handler for the URL could not be found, + * or if some other error occurred while constructing the URL + */ + public URL toURL() throws MalformedURLException { + return fromURI(this); + } + + /** + * Creates a URL from a URI, as if by invoking {@code uri.toURL()}. + * + * @see java.net.URI#toURL() + */ + static URL fromURI(URI uri) throws MalformedURLException { + if (!uri.isAbsolute()) { + throw new IllegalArgumentException("URI is not absolute"); + } + String protocol = uri.getScheme(); + + // In general we need to go via Handler.parseURL, but for the jrt + // protocol we enforce that the Handler is not overrideable and can + // optimize URI to URL conversion. + // + // Case-sensitive comparison for performance; malformed protocols will + // be handled correctly by the slow path. + if (protocol.equals("jrt") && !uri.isOpaque() + && uri.getRawFragment() == null) { + + String query = uri.getRawQuery(); + String path = uri.getRawPath(); + String file = (query == null) ? path : path + "?" + query; + + // URL represent undefined host as empty string while URI use null + String host = uri.getHost(); + if (host == null) { + host = ""; + } + + int port = uri.getPort(); + + return new URL("jrt", host, port, file, null); + } else { + return new URL((URL)null, uri.toString(), null); + } + } + + // -- Component access methods -- + + /** + * Returns the scheme component of this URI. + * + *

The scheme component of a URI, if defined, only contains characters + * in the alphanum category and in the string {@code "-.+"}. A + * scheme always starts with an alpha character.

+ * + * The scheme component of a URI cannot contain escaped octets, hence this + * method does not perform any decoding. + * + * @return The scheme component of this URI, + * or {@code null} if the scheme is undefined + */ + @Override + public String getScheme() { + return scheme; + } + + /** + * Tells whether or not this URI is absolute. + * + *

A URI is absolute if, and only if, it has a scheme component.

+ * + * @return {@code true} if, and only if, this URI is absolute + */ + @Override + public boolean isAbsolute() { + return scheme != null; + } + + /** + * Tells whether or not this URI is opaque. + * + *

A URI is opaque if, and only if, it is absolute and its + * scheme-specific part does not begin with a slash character ('/'). + * An opaque URI has a scheme, a scheme-specific part, and possibly + * a fragment; all other components are undefined.

+ * + * @return {@code true} if, and only if, this URI is opaque + */ + @Override + public boolean isOpaque() { + return path == null; + } + + /** + * Returns the raw scheme-specific part of this URI. The scheme-specific + * part is never undefined, though it may be empty. + * + *

The scheme-specific part of a URI only contains legal URI + * characters.

+ * + * @return The raw scheme-specific part of this URI + * (never {@code null}) + */ + public String getRawSchemeSpecificPart() { + String part = schemeSpecificPart; + if (part != null) { + return part; + } + + String s = string; + if (s != null) { + // if string is defined, components will have been parsed + int start = 0; + int end = s.length(); + if (scheme != null) { + start = scheme.length() + 1; + } + if (fragment != null) { + end -= fragment.length() + 1; + } + if (path != null && path.length() == end - start) { + part = path; + } else { + part = s.substring(start, end); + } + } else { + StringBuilder sb = new StringBuilder(); + appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), + host, port, getPath(), getQuery()); + part = sb.toString(); + } + return schemeSpecificPart = part; + } + + /** + * Returns the decoded scheme-specific part of this URI. + * + *

The string returned by this method is equal to that returned by the + * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method + * except that all sequences of escaped octets are decoded.

+ * + * @return The decoded scheme-specific part of this URI + * (never {@code null}) + */ + public String getSchemeSpecificPart() { + String part = decodedSchemeSpecificPart; + if (part == null) { + decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart()); + } + return part; + } + + /** + * Returns the raw authority component of this URI. + * + *

The authority component of a URI, if defined, only contains the + * commercial-at character ({@code '@'}) and characters in the + * unreserved, punct, escaped, and other + * categories. If the authority is server-based then it is further + * constrained to have valid user-information, host, and port + * components.

+ * + * @return The raw authority component of this URI, + * or {@code null} if the authority is undefined + */ + public String getRawAuthority() { + return authority; + } + + /** + * Returns the decoded authority component of this URI. + * + *

The string returned by this method is equal to that returned by the + * {@link #getRawAuthority() getRawAuthority} method except that all + * sequences of escaped octets are decoded.

+ * + * @return The decoded authority component of this URI, + * or {@code null} if the authority is undefined + */ + @Override + public String getAuthority() { + String auth = decodedAuthority; + if ((auth == null) && (authority != null)) { + decodedAuthority = auth = decode(authority); + } + return auth; + } + + /** + * Returns the raw user-information component of this URI. + * + *

The user-information component of a URI, if defined, only contains + * characters in the unreserved, punct, escaped, and + * other categories.

+ * + * @return The raw user-information component of this URI, + * or {@code null} if the user information is undefined + */ + @Override + public String getRawUserInfo() { + return userInfo; + } + + /** + * Returns the decoded user-information component of this URI. + * + *

The string returned by this method is equal to that returned by the + * {@link #getRawUserInfo() getRawUserInfo} method except that all + * sequences of escaped octets are decoded.

+ * + * @return The decoded user-information component of this URI, + * or {@code null} if the user information is undefined + */ + @Override + public String getUserInfo() { + String user = decodedUserInfo; + if ((user == null) && (userInfo != null)) { + decodedUserInfo = user = decode(userInfo); + } + return user; + } + + /** + * Returns the host component of this URI. + * + *

The host component of a URI, if defined, will have one of the + * following forms:

+ * + *
    + * + *
  • A domain name consisting of one or more labels + * separated by period characters ({@code '.'}), optionally followed by + * a period character. Each label consists of alphanum characters + * as well as hyphen characters ({@code '-'}), though hyphens never + * occur as the first or last characters in a label. The rightmost + * label of a domain name consisting of two or more labels, begins + * with an alpha character.

  • + * + *
  • A dotted-quad IPv4 address of the form + * digit{@code +.}digit{@code +.}digit{@code +.}digit{@code +}, + * where no digit sequence is longer than three characters and no + * sequence has a value larger than 255.

  • + * + *
  • An IPv6 address enclosed in square brackets ({@code '['} and + * {@code ']'}) and consisting of hexadecimal digits, colon characters + * ({@code ':'}), and possibly an embedded IPv4 address. The full + * syntax of IPv6 addresses is specified in RFC 2373: IPv6 + * Addressing Architecture.

  • + * + *
+ * + * The host component of a URI cannot contain escaped octets, hence this + * method does not perform any decoding. + * + * @return The host component of this URI, + * or {@code null} if the host is undefined + */ + @Override + public String getHost() { + return host; + } + + /** + * Returns the port number of this URI. + * + *

The port component of a URI, if defined, is a non-negative + * integer.

+ * + * @return The port component of this URI, + * or {@code -1} if the port is undefined + */ + @Override + public int getPort() { + return port; + } + + /** + * Returns the raw path component of this URI. + * + *

The path component of a URI, if defined, only contains the slash + * character ({@code '/'}), the commercial-at character ({@code '@'}), + * and characters in the unreserved, punct, escaped, + * and other categories.

+ * + * @return The path component of this URI, + * or {@code null} if the path is undefined + */ + @Override + public String getRawPath() { + return path; + } + + /** + * Returns the decoded path component of this URI. + * + *

The string returned by this method is equal to that returned by the + * {@link #getRawPath() getRawPath} method except that all sequences of + * escaped octets are decoded.

+ * + * @return The decoded path component of this URI, + * or {@code null} if the path is undefined + */ + @Override + public String getPath() { + String decoded = decodedPath; + if ((decoded == null) && (path != null)) { + decodedPath = decoded = decode(path); + } + return decoded; + } + + /** + * Returns the raw query component of this URI. + * + *

The query component of a URI, if defined, only contains legal URI + * characters.

+ * + * @return The raw query component of this URI, + * or {@code null} if the query is undefined + */ + @Override + public String getRawQuery() { + return query; + } + + /** + * Returns the decoded query component of this URI. + * + *

The string returned by this method is equal to that returned by the + * {@link #getRawQuery() getRawQuery} method except that all sequences of + * escaped octets are decoded.

+ * + * @return The decoded query component of this URI, + * or {@code null} if the query is undefined + */ + @Override + public String getQuery() { + String decoded = decodedQuery; + if ((decoded == null) && (query != null)) { + decodedQuery = decoded = decode(query, false); + } + return decoded; + } + + /** + * Returns the raw fragment component of this URI. + * + *

The fragment component of a URI, if defined, only contains legal URI + * characters.

+ * + * @return The raw fragment component of this URI, + * or {@code null} if the fragment is undefined + */ + @Override + public String getRawFragment() { + return fragment; + } + + /** + * Returns the decoded fragment component of this URI. + * + *

The string returned by this method is equal to that returned by the + * {@link #getRawFragment() getRawFragment} method except that all + * sequences of escaped octets are decoded.

+ * + * @return The decoded fragment component of this URI, + * or {@code null} if the fragment is undefined + */ + @Override + public String getFragment() { + String decoded = decodedFragment; + if ((decoded == null) && (fragment != null)) { + decodedFragment = decoded = decode(fragment, false); + } + return decoded; + } + + + // -- Equality, comparison, hash code, toString, and serialization -- + + /** + * Tests this URI for equality with another object. + * + *

If the given object is not a URI then this method immediately + * returns {@code false}. + * + *

For two URIs to be considered equal requires that either both are + * opaque or both are hierarchical. Their schemes must either both be + * undefined or else be equal without regard to case. Their fragments + * must either both be undefined or else be equal. + * + *

For two opaque URIs to be considered equal, their scheme-specific + * parts must be equal. + * + *

For two hierarchical URIs to be considered equal, their paths must + * be equal and their queries must either both be undefined or else be + * equal. Their authorities must either both be undefined, or both be + * registry-based, or both be server-based. If their authorities are + * defined and are registry-based, then they must be equal. If their + * authorities are defined and are server-based, then their hosts must be + * equal without regard to case, their port numbers must be equal, and + * their user-information components must be equal. + * + *

When testing the user-information, path, query, fragment, authority, + * or scheme-specific parts of two URIs for equality, the raw forms rather + * than the encoded forms of these components are compared and the + * hexadecimal digits of escaped octets are compared without regard to + * case. + * + *

This method satisfies the general contract of the {@link + * java.lang.Object#equals(Object) Object.equals} method.

+ * + * @param ob The object to which this object is to be compared + * + * @return {@code true} if, and only if, the given object is a URI that + * is identical to this URI + */ + public boolean equals(Object ob) { + if (ob == this) + return true; + if (!(ob instanceof URI)) + return false; + URI that = (URI)ob; + if (this.isOpaque() != that.isOpaque()) return false; + if (!equalIgnoringCase(this.scheme, that.scheme)) return false; + if (!equal(this.fragment, that.fragment)) return false; + + // Opaque + if (this.isOpaque()) + return equal(this.schemeSpecificPart, that.schemeSpecificPart); + + // Hierarchical + if (!equal(this.path, that.path)) return false; + if (!equal(this.query, that.query)) return false; + + // Authorities + if (this.authority == that.authority) return true; + if (this.host != null) { + // Server-based + if (!equal(this.userInfo, that.userInfo)) return false; + if (!equalIgnoringCase(this.host, that.host)) return false; + if (this.port != that.port) return false; + } else if (this.authority != null) { + // Registry-based + if (!equal(this.authority, that.authority)) return false; + } else if (this.authority != that.authority) { + return false; + } + + return true; + } + + /** + * Returns a hash-code value for this URI. The hash code is based upon all + * of the URI's components, and satisfies the general contract of the + * {@link java.lang.Object#hashCode() Object.hashCode} method. + * + * @return A hash-code value for this URI + */ + public int hashCode() { + int h = hash; + if (h == 0) { + h = hashIgnoringCase(0, scheme); + h = hash(h, fragment); + if (isOpaque()) { + h = hash(h, schemeSpecificPart); + } else { + h = hash(h, path); + h = hash(h, query); + if (host != null) { + h = hash(h, userInfo); + h = hashIgnoringCase(h, host); + h += 1949 * port; + } else { + h = hash(h, authority); + } + } + if (h != 0) { + hash = h; + } + } + return h; + } + + /** + * Compares this URI to another object, which must be a URI. + * + *

When comparing corresponding components of two URIs, if one + * component is undefined but the other is defined then the first is + * considered to be less than the second. Unless otherwise noted, string + * components are ordered according to their natural, case-sensitive + * ordering as defined by the {@link java.lang.String#compareTo(Object) + * String.compareTo} method. String components that are subject to + * encoding are compared by comparing their raw forms rather than their + * encoded forms. + * + *

The ordering of URIs is defined as follows:

+ * + *
    + * + *
  • Two URIs with different schemes are ordered according to the + * ordering of their schemes, without regard to case.

  • + * + *
  • A hierarchical URI is considered to be less than an opaque URI + * with an identical scheme.

  • + * + *
  • Two opaque URIs with identical schemes are ordered according + * to the ordering of their scheme-specific parts.

  • + * + *
  • Two opaque URIs with identical schemes and scheme-specific + * parts are ordered according to the ordering of their + * fragments.

  • + * + *
  • Two hierarchical URIs with identical schemes are ordered + * according to the ordering of their authority components:

    + * + *
      + * + *
    • If both authority components are server-based then the URIs + * are ordered according to their user-information components; if these + * components are identical then the URIs are ordered according to the + * ordering of their hosts, without regard to case; if the hosts are + * identical then the URIs are ordered according to the ordering of + * their ports.

    • + * + *
    • If one or both authority components are registry-based then + * the URIs are ordered according to the ordering of their authority + * components.

    • + * + *
  • + * + *
  • Finally, two hierarchical URIs with identical schemes and + * authority components are ordered according to the ordering of their + * paths; if their paths are identical then they are ordered according to + * the ordering of their queries; if the queries are identical then they + * are ordered according to the order of their fragments.

  • + * + *
+ * + *

This method satisfies the general contract of the {@link + * java.lang.Comparable#compareTo(Object) Comparable.compareTo} + * method.

+ * + * @param that + * The object to which this URI is to be compared + * + * @return A negative integer, zero, or a positive integer as this URI is + * less than, equal to, or greater than the given URI + * + * @throws ClassCastException + * If the given object is not a URI + */ + public int compareTo(URI that) { + int c; + + if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) + return c; + + if (this.isOpaque()) { + if (that.isOpaque()) { + // Both opaque + if ((c = compare(this.schemeSpecificPart, + that.schemeSpecificPart)) != 0) + return c; + return compare(this.fragment, that.fragment); + } + return +1; // Opaque > hierarchical + } else if (that.isOpaque()) { + return -1; // Hierarchical < opaque + } + + // Hierarchical + if ((this.host != null) && (that.host != null)) { + // Both server-based + if ((c = compare(this.userInfo, that.userInfo)) != 0) + return c; + if ((c = compareIgnoringCase(this.host, that.host)) != 0) + return c; + if ((c = this.port - that.port) != 0) + return c; + } else { + // If one or both authorities are registry-based then we simply + // compare them in the usual, case-sensitive way. If one is + // registry-based and one is server-based then the strings are + // guaranteed to be unequal, hence the comparison will never return + // zero and the compareTo and equals methods will remain + // consistent. + if ((c = compare(this.authority, that.authority)) != 0) return c; + } + + if ((c = compare(this.path, that.path)) != 0) return c; + if ((c = compare(this.query, that.query)) != 0) return c; + return compare(this.fragment, that.fragment); + } + + /** + * Returns the content of this URI as a string. + * + *

If this URI was created by invoking one of the constructors in this + * class then a string equivalent to the original input string, or to the + * string computed from the originally-given components, as appropriate, is + * returned. Otherwise this URI was created by normalization, resolution, + * or relativization, and so a string is constructed from this URI's + * components according to the rules specified in RFC 2396, + * section 5.2, step 7.

+ * + * @return The string form of this URI + */ + @Override + public String toString() { + String s = string; + if (s == null) { + s = defineString(); + } + return s; + } + + private String defineString() { + String s = string; + if (s != null) { + return s; + } + + StringBuilder sb = new StringBuilder(); + if (scheme != null) { + sb.append(scheme); + sb.append(':'); + } + if (isOpaque()) { + sb.append(schemeSpecificPart); + } else { + if (host != null) { + sb.append("//"); + if (userInfo != null) { + sb.append(userInfo); + sb.append('@'); + } + boolean needBrackets = ((host.indexOf(':') >= 0) + && !host.startsWith("[") + && !host.endsWith("]")); + if (needBrackets) sb.append('['); + sb.append(host); + if (needBrackets) sb.append(']'); + if (port != -1) { + sb.append(':'); + sb.append(port); + } + } else if (authority != null) { + sb.append("//"); + sb.append(authority); + } + if (path != null) + sb.append(path); + if (query != null) { + sb.append('?'); + sb.append(query); + } + } + if (fragment != null) { + sb.append('#'); + sb.append(fragment); + } + return string = sb.toString(); + } + + /** + * Returns the content of this URI as a US-ASCII string. + * + *

If this URI does not contain any characters in the other + * category then an invocation of this method will return the same value as + * an invocation of the {@link #toString() toString} method. Otherwise + * this method works as if by invoking that method and then encoding the result.

+ * + * @return The string form of this URI, encoded as needed + * so that it only contains characters in the US-ASCII + * charset + */ + @Override + public String toASCIIString() { + return encode(toString()); + } + + + // -- Serialization support -- + + /** + * Saves the content of this URI to the given serial stream. + * + *

The only serializable field of a URI instance is its {@code string} + * field. That field is given a value, if it does not have one already, + * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} + * method of the given object-output stream is invoked.

+ * + * @param os The object-output stream to which this object + * is to be written + */ + private void writeObject(ObjectOutputStream os) + throws IOException + { + defineString(); + os.defaultWriteObject(); // Writes the string field only + } + + /** + * Reconstitutes a URI from the given serial stream. + * + *

The {@link java.io.ObjectInputStream#defaultReadObject()} method is + * invoked to read the value of the {@code string} field. The result is + * then parsed in the usual way. + * + * @param is The object-input stream from which this object + * is being read + */ + private void readObject(ObjectInputStream is) + throws ClassNotFoundException, IOException + { + port = -1; // Argh + is.defaultReadObject(); + try { + new Parser(string).parse(false); + } catch (URISyntaxException x) { + IOException y = new InvalidObjectException("Invalid URI"); + y.initCause(x); + throw y; + } + } + + + // -- End of public methods -- + + + // -- Utility methods for string-field comparison and hashing -- + + // These methods return appropriate values for null string arguments, + // thereby simplifying the equals, hashCode, and compareTo methods. + // + // The case-ignoring methods should only be applied to strings whose + // characters are all known to be US-ASCII. Because of this restriction, + // these methods are faster than the similar methods in the String class. 
+ + // US-ASCII only + private static int toLower(char c) { + if ((c >= 'A') && (c <= 'Z')) + return c + ('a' - 'A'); + return c; + } + + // US-ASCII only + private static int toUpper(char c) { + if ((c >= 'a') && (c <= 'z')) + return c - ('a' - 'A'); + return c; + } + + private static boolean equal(String s, String t) { + if (s == t) return true; + if ((s != null) && (t != null)) { + if (s.length() != t.length()) + return false; + if (s.indexOf('%') < 0) + return s.equals(t); + int n = s.length(); + for (int i = 0; i < n;) { + char c = s.charAt(i); + char d = t.charAt(i); + if (c != '%') { + if (c != d) + return false; + i++; + continue; + } + if (d != '%') + return false; + i++; + if (toLower(s.charAt(i)) != toLower(t.charAt(i))) + return false; + i++; + if (toLower(s.charAt(i)) != toLower(t.charAt(i))) + return false; + i++; + } + return true; + } + return false; + } + + // US-ASCII only + private static boolean equalIgnoringCase(String s, String t) { + if (s == t) return true; + if ((s != null) && (t != null)) { + int n = s.length(); + if (t.length() != n) + return false; + for (int i = 0; i < n; i++) { + if (toLower(s.charAt(i)) != toLower(t.charAt(i))) + return false; + } + return true; + } + return false; + } + + private static int hash(int hash, String s) { + if (s == null) return hash; + return s.indexOf('%') < 0 ? 
hash * 127 + s.hashCode() + : normalizedHash(hash, s); + } + + + private static int normalizedHash(int hash, String s) { + int h = 0; + for (int index = 0; index < s.length(); index++) { + char ch = s.charAt(index); + h = 31 * h + ch; + if (ch == '%') { + /* + * Process the next two encoded characters + */ + for (int i = index + 1; i < index + 3; i++) + h = 31 * h + toUpper(s.charAt(i)); + index += 2; + } + } + return hash * 127 + h; + } + + // US-ASCII only + private static int hashIgnoringCase(int hash, String s) { + if (s == null) return hash; + int h = hash; + int n = s.length(); + for (int i = 0; i < n; i++) + h = 31 * h + toLower(s.charAt(i)); + return h; + } + + private static int compare(String s, String t) { + if (s == t) return 0; + if (s != null) { + if (t != null) + return s.compareTo(t); + else + return +1; + } else { + return -1; + } + } + + // US-ASCII only + private static int compareIgnoringCase(String s, String t) { + if (s == t) return 0; + if (s != null) { + if (t != null) { + int sn = s.length(); + int tn = t.length(); + int n = sn < tn ? 
sn : tn; + for (int i = 0; i < n; i++) { + int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); + if (c != 0) + return c; + } + return sn - tn; + } + return +1; + } else { + return -1; + } + } + + + // -- String construction -- + + // If a scheme is given then the path, if given, must be absolute + // + private static void checkPath(String s, String scheme, String path) + throws URISyntaxException + { + if (scheme != null) { + if (path != null && !path.isEmpty() && path.charAt(0) != '/') + throw new URISyntaxException(s, "Relative path in absolute URI"); + } + } + + private void appendAuthority(StringBuilder sb, + String authority, + String userInfo, + String host, + int port) + { + if (host != null) { + sb.append("//"); + if (userInfo != null) { + sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); + sb.append('@'); + } + boolean needBrackets = ((host.indexOf(':') >= 0) + && !host.startsWith("[") + && !host.endsWith("]")); + if (needBrackets) sb.append('['); + sb.append(host); + if (needBrackets) sb.append(']'); + if (port != -1) { + sb.append(':'); + sb.append(port); + } + } else if (authority != null) { + sb.append("//"); + if (authority.startsWith("[")) { + // authority should (but may not) contain an embedded IPv6 address + int end = authority.indexOf(']'); + String doquote = authority, dontquote = ""; + if (end != -1 && authority.indexOf(':') != -1) { + // the authority contains an IPv6 address + if (end == authority.length()) { + dontquote = authority; + doquote = ""; + } else { + dontquote = authority.substring(0 , end + 1); + doquote = authority.substring(end + 1); + } + } + sb.append(dontquote); + sb.append(quote(doquote, + L_REG_NAME | L_SERVER, + H_REG_NAME | H_SERVER)); + } else { + sb.append(quote(authority, + L_REG_NAME | L_SERVER, + H_REG_NAME | H_SERVER)); + } + } + } + + private void appendSchemeSpecificPart(StringBuilder sb, + String opaquePart, + String authority, + String userInfo, + String host, + int port, + String path, + String query) + { + 
if (opaquePart != null) { + /* check if SSP begins with an IPv6 address + * because we must not quote a literal IPv6 address + */ + if (opaquePart.startsWith("//[")) { + int end = opaquePart.indexOf(']'); + if (end != -1 && opaquePart.indexOf(':')!=-1) { + String doquote, dontquote; + if (end == opaquePart.length()) { + dontquote = opaquePart; + doquote = ""; + } else { + dontquote = opaquePart.substring(0,end+1); + doquote = opaquePart.substring(end+1); + } + sb.append (dontquote); + sb.append(quote(doquote, L_URIC, H_URIC)); + } + } else { + sb.append(quote(opaquePart, L_URIC, H_URIC)); + } + } else { + appendAuthority(sb, authority, userInfo, host, port); + if (path != null) + sb.append(quote(path, L_PATH, H_PATH)); + if (query != null) { + sb.append('?'); + sb.append(quote(query, L_URIC, H_URIC)); + } + } + } + + private void appendFragment(StringBuilder sb, String fragment) { + if (fragment != null) { + sb.append('#'); + sb.append(quote(fragment, L_URIC, H_URIC)); + } + } + + private String toString(String scheme, + String opaquePart, + String authority, + String userInfo, + String host, + int port, + String path, + String query, + String fragment) + { + StringBuilder sb = new StringBuilder(); + if (scheme != null) { + sb.append(scheme); + sb.append(':'); + } + appendSchemeSpecificPart(sb, opaquePart, + authority, userInfo, host, port, + path, query); + appendFragment(sb, fragment); + return sb.toString(); + } + + // -- Normalization, resolution, and relativization -- + + // RFC2396 5.2 (6) + private static String resolvePath(String base, String child, + boolean absolute) + { + int i = base.lastIndexOf('/'); + int cn = child.length(); + String path = ""; + + if (cn == 0) { + // 5.2 (6a) + if (i >= 0) + path = base.substring(0, i + 1); + } else { + StringBuilder sb = new StringBuilder(base.length() + cn); + // 5.2 (6a) + if (i >= 0) + sb.append(base, 0, i + 1); + // 5.2 (6b) + sb.append(child); + path = sb.toString(); + } + + // 5.2 (6c-f) + String np = 
normalize(path); + + // 5.2 (6g): If the result is absolute but the path begins with "../", + // then we simply leave the path as-is + + return np; + } + + // RFC2396 5.2 + private static URI resolve(URI base, URI child) { + // check if child if opaque first so that NPE is thrown + // if child is null. + if (child.isOpaque() || base.isOpaque()) + return child; + + // 5.2 (2): Reference to current document (lone fragment) + if ((child.scheme == null) && (child.authority == null) + && child.path.isEmpty() && (child.fragment != null) + && (child.query == null)) { + if ((base.fragment != null) + && child.fragment.equals(base.fragment)) { + return base; + } + URI ru = new URI(); + ru.scheme = base.scheme; + ru.authority = base.authority; + ru.userInfo = base.userInfo; + ru.host = base.host; + ru.port = base.port; + ru.path = base.path; + ru.fragment = child.fragment; + ru.query = base.query; + return ru; + } + + // 5.2 (3): Child is absolute + if (child.scheme != null) + return child; + + URI ru = new URI(); // Resolved URI + ru.scheme = base.scheme; + ru.query = child.query; + ru.fragment = child.fragment; + + // 5.2 (4): Authority + if (child.authority == null) { + ru.authority = base.authority; + ru.host = base.host; + ru.userInfo = base.userInfo; + ru.port = base.port; + + String cp = (child.path == null) ? "" : child.path; + if (!cp.isEmpty() && cp.charAt(0) == '/') { + // 5.2 (5): Child path is absolute + ru.path = child.path; + } else { + // 5.2 (6): Resolve relative path + ru.path = resolvePath(base.path, cp, base.isAbsolute()); + } + } else { + ru.authority = child.authority; + ru.host = child.host; + ru.userInfo = child.userInfo; + ru.host = child.host; + ru.port = child.port; + ru.path = child.path; + } + + // 5.2 (7): Recombine (nothing to do here) + return ru; + } + + // If the given URI's path is normal then return the URI; + // o.w., return a new URI containing the normalized path. 
+ // + private static URI normalize(URI u) { + if (u.isOpaque() || u.path == null || u.path.isEmpty()) + return u; + + String np = normalize(u.path); + if (np == u.path) + return u; + + URI v = new URI(); + v.scheme = u.scheme; + v.fragment = u.fragment; + v.authority = u.authority; + v.userInfo = u.userInfo; + v.host = u.host; + v.port = u.port; + v.path = np; + v.query = u.query; + return v; + } + + // If both URIs are hierarchical, their scheme and authority components are + // identical, and the base path is a prefix of the child's path, then + // return a relative URI that, when resolved against the base, yields the + // child; otherwise, return the child. + // + private static URI relativize(URI base, URI child) { + // check if child if opaque first so that NPE is thrown + // if child is null. + if (child.isOpaque() || base.isOpaque()) + return child; + if (!equalIgnoringCase(base.scheme, child.scheme) + || !equal(base.authority, child.authority)) + return child; + + String bp = normalize(base.path); + String cp = normalize(child.path); + if (!bp.equals(cp)) { + if (!bp.endsWith("/")) + bp = bp + "/"; + if (!cp.startsWith(bp)) + return child; + } + + URI v = new URI(); + v.path = cp.substring(bp.length()); + v.query = child.query; + v.fragment = child.fragment; + return v; + } + + + + // -- Path normalization -- + + // The following algorithm for path normalization avoids the creation of a + // string object for each segment, as well as the use of a string buffer to + // compute the final result, by using a single char array and editing it in + // place. The array is first split into segments, replacing each slash + // with '\0' and creating a segment-index array, each element of which is + // the index of the first char in the corresponding segment. We then walk + // through both arrays, removing ".", "..", and other segments as necessary + // by setting their entries in the index array to -1. 
Finally, the two + // arrays are used to rejoin the segments and compute the final result. + // + // This code is based upon src/solaris/native/java/io/canonicalize_md.c + + + // Check the given path to see if it might need normalization. A path + // might need normalization if it contains duplicate slashes, a "." + // segment, or a ".." segment. Return -1 if no further normalization is + // possible, otherwise return the number of segments found. + // + // This method takes a string argument rather than a char array so that + // this test can be performed without invoking path.toCharArray(). + // + private static int needsNormalization(String path) { + boolean normal = true; + int ns = 0; // Number of segments + int end = path.length() - 1; // Index of last char in path + int p = 0; // Index of next char in path + + // Skip initial slashes + while (p <= end) { + if (path.charAt(p) != '/') break; + p++; + } + if (p > 1) normal = false; + + // Scan segments + while (p <= end) { + + // Looking at "." or ".." ? + if ((path.charAt(p) == '.') + && ((p == end) + || ((path.charAt(p + 1) == '/') + || ((path.charAt(p + 1) == '.') + && ((p + 1 == end) + || (path.charAt(p + 2) == '/')))))) { + normal = false; + } + ns++; + + // Find beginning of next segment + while (p <= end) { + if (path.charAt(p++) != '/') + continue; + + // Skip redundant slashes + while (p <= end) { + if (path.charAt(p) != '/') break; + normal = false; + p++; + } + + break; + } + } + + return normal ? -1 : ns; + } + + + // Split the given path into segments, replacing slashes with nulls and + // filling in the given segment-index array. 
+ // + // Preconditions: + // segs.length == Number of segments in path + // + // Postconditions: + // All slashes in path replaced by '\0' + // segs[i] == Index of first char in segment i (0 <= i < segs.length) + // + private static void split(char[] path, int[] segs) { + int end = path.length - 1; // Index of last char in path + int p = 0; // Index of next char in path + int i = 0; // Index of current segment + + // Skip initial slashes + while (p <= end) { + if (path[p] != '/') break; + path[p] = '\0'; + p++; + } + + while (p <= end) { + + // Note start of segment + segs[i++] = p++; + + // Find beginning of next segment + while (p <= end) { + if (path[p++] != '/') + continue; + path[p - 1] = '\0'; + + // Skip redundant slashes + while (p <= end) { + if (path[p] != '/') break; + path[p++] = '\0'; + } + break; + } + } + + if (i != segs.length) + throw new InternalError(); // ASSERT + } + + + // Join the segments in the given path according to the given segment-index + // array, ignoring those segments whose index entries have been set to -1, + // and inserting slashes as needed. Return the length of the resulting + // path. + // + // Preconditions: + // segs[i] == -1 implies segment i is to be ignored + // path computed by split, as above, with '\0' having replaced '/' + // + // Postconditions: + // path[0] .. 
//   path[return value] == Resulting path
//
// The path is rewritten in place, front to back: kept segments are either
// already at the write position or are copied down to it.  A segment that
// is followed by further chars in the array gets a '/' written after it.
//
private static int join(char[] path, int[] segs) {
    int ns = segs.length;           // Number of segments
    int end = path.length - 1;      // Index of last char in path
    int p = 0;                      // Index of next path char to write

    if (path[p] == '\0') {
        // Restore initial slash for absolute paths
        path[p++] = '/';
    }

    for (int i = 0; i < ns; i++) {
        int q = segs[i];            // Current segment
        if (q == -1)
            // Ignore this segment
            continue;

        if (p == q) {
            // We're already at this segment, so just skip to its end
            while ((p <= end) && (path[p] != '\0'))
                p++;
            if (p <= end) {
                // Preserve trailing slash
                path[p++] = '/';
            }
        } else if (p < q) {
            // Copy q down to p
            while ((q <= end) && (path[q] != '\0'))
                path[p++] = path[q++];
            if (q <= end) {
                // Preserve trailing slash
                path[p++] = '/';
            }
        } else
            // The write position has overtaken the segment start
            throw new InternalError(); // ASSERT false
    }

    return p;
}


// Remove "." segments from the given path, and remove segment pairs
// consisting of a non-".." segment followed by a ".." segment.
//
// Segments are removed by setting their entry in segs to -1; the chars in
// path are only read here, never modified.
//
private static void removeDots(char[] path, int[] segs) {
    int ns = segs.length;
    int end = path.length - 1;

    for (int i = 0; i < ns; i++) {
        int dots = 0;               // Number of dots found (0, 1, or 2)

        // Find next occurrence of "." or ".."
        do {
            int p = segs[i];
            if (path[p] == '.') {
                if (p == end) {
                    // "." is the last char of the path
                    dots = 1;
                    break;
                } else if (path[p + 1] == '\0') {
                    // "." followed by a separator
                    dots = 1;
                    break;
                } else if ((path[p + 1] == '.')
                           && ((p + 1 == end)
                               || (path[p + 2] == '\0'))) {
                    // ".." at the end of the path or followed by a separator
                    dots = 2;
                    break;
                }
            }
            i++;
        } while (i < ns);
        // Stop when the scan ran off the end without finding a dot segment
        if ((i > ns) || (dots == 0))
            break;

        if (dots == 1) {
            // Remove this occurrence of "."
            segs[i] = -1;
        } else {
            // If there is a preceding non-".." segment, remove both that
            // segment and this occurrence of ".."; otherwise, leave this
            // ".." segment as-is.
            int j;
            // Walk back to the nearest segment that has not been removed
            for (j = i - 1; j >= 0; j--) {
                if (segs[j] != -1) break;
            }
            if (j >= 0) {
                int q = segs[j];
                if (!((path[q] == '.')
                      && (path[q + 1] == '.')
                      && (path[q + 2] == '\0'))) {
                    segs[i] = -1;
                    segs[j] = -1;
                }
            }
        }
    }
}


// DEVIATION: If the normalized path is relative, and if the first
// segment could be parsed as a scheme name, then prepend a "." segment
//
// "Could be parsed as a scheme name" is tested here as: the first
// surviving segment contains a ':'.
//
private static void maybeAddLeadingDot(char[] path, int[] segs) {

    if (path[0] == '\0')
        // The path is absolute
        return;

    int ns = segs.length;
    int f = 0;                      // Index of first segment
    while (f < ns) {
        if (segs[f] >= 0)
            break;
        f++;
    }
    if ((f >= ns) || (f == 0))
        // The path is empty, or else the original first segment survived,
        // in which case we already know that no leading "." is needed
        return;

    int p = segs[f];
    // Look for a ':' within the first surviving segment
    while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
    if (p >= path.length || path[p] == '\0')
        // No colon in first segment, so no "." needed
        return;

    // At this point we know that the first segment is unused,
    // hence we can insert a "." segment at that position
    path[0] = '.';
    path[1] = '\0';
    segs[0] = 0;
}


// Normalize the given path string.  A normal path string has no empty
// segments (i.e., occurrences of "//"), no segments equal to ".", and no
// segments equal to ".." that are preceded by a segment not equal to "..".
// In contrast to Unix-style pathname normalization, for URI paths we
// always retain trailing slashes.
//
// When nothing changes, the argument itself (the same String instance) is
// returned.
//
private static String normalize(String ps) {

    // Does this path need normalization?
    int ns = needsNormalization(ps);        // Number of segments
    if (ns < 0)
        // Nope -- just return it
        return ps;

    char[] path = ps.toCharArray();         // Path in char-array form

    // Split path into segments
    int[] segs = new int[ns];               // Segment-index array
    split(path, segs);

    // Remove dots
    removeDots(path, segs);

    // Prevent scheme-name confusion
    maybeAddLeadingDot(path, segs);

    // Join the remaining segments and return the result
    String s = new String(path, 0, join(path, segs));
    if (s.equals(ps)) {
        // string was already normalized
        return ps;
    }
    return s;
}



// -- Character classes for parsing --

// RFC2396 precisely specifies which characters in the US-ASCII charset are
// permissible in the various components of a URI reference.  We here
// define a set of mask pairs to aid in enforcing these restrictions.  Each
// mask pair consists of two longs, a low mask and a high mask.  Taken
// together they represent a 128-bit mask, where bit i is set iff the
// character with value i is permitted.
//
// This approach is more efficient than sequentially searching arrays of
// permitted characters.  It could be made still more efficient by
// precompiling the mask information so that a character's presence in a
// given mask could be determined by a single table lookup.

// To save startup time, we manually calculate the low-/highMask constants.
+ // For reference, the following methods were used to calculate the values: + + // Compute the low-order mask for the characters in the given string + // private static long lowMask(String chars) { + // int n = chars.length(); + // long m = 0; + // for (int i = 0; i < n; i++) { + // char c = chars.charAt(i); + // if (c < 64) + // m |= (1L << c); + // } + // return m; + // } + + // Compute the high-order mask for the characters in the given string + // private static long highMask(String chars) { + // int n = chars.length(); + // long m = 0; + // for (int i = 0; i < n; i++) { + // char c = chars.charAt(i); + // if ((c >= 64) && (c < 128)) + // m |= (1L << (c - 64)); + // } + // return m; + // } + + // Compute a low-order mask for the characters + // between first and last, inclusive + // private static long lowMask(char first, char last) { + // long m = 0; + // int f = Math.max(Math.min(first, 63), 0); + // int l = Math.max(Math.min(last, 63), 0); + // for (int i = f; i <= l; i++) + // m |= 1L << i; + // return m; + // } + + // Compute a high-order mask for the characters + // between first and last, inclusive + // private static long highMask(char first, char last) { + // long m = 0; + // int f = Math.max(Math.min(first, 127), 64) - 64; + // int l = Math.max(Math.min(last, 127), 64) - 64; + // for (int i = f; i <= l; i++) + // m |= 1L << i; + // return m; + // } + + // Tell whether the given character is permitted by the given mask pair + private static boolean match(char c, long lowMask, long highMask) { + if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. + return false; + if (c < 64) + return ((1L << c) & lowMask) != 0; + if (c < 128) + return ((1L << (c - 64)) & highMask) != 0; + return false; + } + + // Character-class masks, in reverse order from RFC2396 because + // initializers for static fields cannot make forward references. 

// Each character class below is an L_/H_ pair: the L_ constant is the low
// mask and the H_ constant the high mask of the class.  The trailing
// comment on a hex literal names the lowMask/highMask call it was
// calculated from.

// digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
//            "8" | "9"
private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9');
private static final long H_DIGIT = 0L;

// upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
//            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
//            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
private static final long L_UPALPHA = 0L;
private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z');

// lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
//            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
//            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
private static final long L_LOWALPHA = 0L;
private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z');

// alpha         = lowalpha | upalpha
private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

// alphanum      = alpha | digit
private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

// hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
//                         "a" | "b" | "c" | "d" | "e" | "f"
private static final long L_HEX = L_DIGIT;
private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f');

// mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
//                 "(" | ")"
private static final long L_MARK = 0x678200000000L; // lowMask("-_.!~*'()");
private static final long H_MARK = 0x4000000080000000L; // highMask("-_.!~*'()");

// unreserved    = alphanum | mark
private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

// reserved      = ";" | "/" | "?"
//                 | ":" | "@" | "&" | "=" | "+" |
//                 "$" | "," | "[" | "]"
// Added per RFC2732: "[", "]"
private static final long L_RESERVED = 0xAC00985000000000L; // lowMask(";/?:@&=+$,[]");
private static final long H_RESERVED = 0x28000001L; // highMask(";/?:@&=+$,[]");

// The zero'th bit is used to indicate that escape pairs and non-US-ASCII
// characters are allowed; this is handled by the scanEscape method below.
private static final long L_ESCAPED = 1L;
private static final long H_ESCAPED = 0L;

// uric          = reserved | unreserved | escaped
private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

// pchar         = unreserved | escaped |
//                 ":" | "@" | "&" | "=" | "+" | "$" | ","
private static final long L_PCHAR
    = L_UNRESERVED | L_ESCAPED | 0x2400185000000000L; // lowMask(":@&=+$,");
private static final long H_PCHAR
    = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask(":@&=+$,");

// All valid path characters
private static final long L_PATH = L_PCHAR | 0x800800000000000L; // lowMask(";/");
private static final long H_PATH = H_PCHAR; // highMask(";/") == 0x0L;

// Dash, for use in domainlabel and toplabel
private static final long L_DASH = 0x200000000000L; // lowMask("-");
private static final long H_DASH = 0x0L; // highMask("-");

// Dot, for use in hostnames
private static final long L_DOT = 0x400000000000L; // lowMask(".");
private static final long H_DOT = 0x0L; // highMask(".");

// userinfo      = *( unreserved | escaped |
//                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
private static final long L_USERINFO
    = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask(";:&=+$,");
private static final long H_USERINFO
    = H_UNRESERVED | H_ESCAPED; // | highMask(";:&=+$,") == 0L;

// reg_name      = 1*( unreserved | escaped | "$" | "," |
//                     ";" | ":" | "@" | "&" | "=" | "+" )
// ("@" is the only listed char with a value >= 64, hence the 0x1L in the
// high mask; the low-mask literal is the same as the one used for userinfo)
private static final long L_REG_NAME
    = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask("$,;:@&=+");
private static final long H_REG_NAME
    = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask("$,;:@&=+");

// All valid characters for server-based authorities
private static final long L_SERVER
    = L_USERINFO | L_ALPHANUM | L_DASH | 0x400400000000000L; // lowMask(".:@[]");
private static final long H_SERVER
    = H_USERINFO | H_ALPHANUM | H_DASH | 0x28000001L; // highMask(".:@[]");

// Special case of server authority that represents an IPv6 address
// In this case, a % does not signify an escape sequence
private static final long L_SERVER_PERCENT
    = L_SERVER | 0x2000000000L; // lowMask("%");
private static final long H_SERVER_PERCENT
    = H_SERVER; // | highMask("%") == 0L;

// scheme        = alpha *( alpha | digit | "+" | "-" | "." )
private static final long L_SCHEME = L_ALPHA | L_DIGIT | 0x680000000000L; // lowMask("+-.");
private static final long H_SCHEME = H_ALPHA | H_DIGIT; // | highMask("+-.") == 0L

// scope_id = alpha | digit | "_" | "."
+ private static final long L_SCOPE_ID + = L_ALPHANUM | 0x400000000000L; // lowMask("_."); + private static final long H_SCOPE_ID + = H_ALPHANUM | 0x80000000L; // highMask("_."); + + // -- Escaping and encoding -- + + private static final char[] hexDigits = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' + }; + + private static void appendEscape(StringBuilder sb, byte b) { + sb.append('%'); + sb.append(hexDigits[(b >> 4) & 0x0f]); + sb.append(hexDigits[(b >> 0) & 0x0f]); + } + + private static void appendEncoded(StringBuilder sb, char c) { + ByteBuffer bb = null; + try { + bb = ThreadLocalCoders.encoderFor("UTF-8") + .encode(CharBuffer.wrap("" + c)); + } catch (CharacterCodingException x) { + assert false; + } + while (bb.hasRemaining()) { + int b = bb.get() & 0xff; + if (b >= 0x80) + appendEscape(sb, (byte)b); + else + sb.append((char)b); + } + } + + // Quote any characters in s that are not permitted + // by the given mask pair + // + private static String quote(String s, long lowMask, long highMask) { + StringBuilder sb = null; + boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (c < '\u0080') { + if (!match(c, lowMask, highMask)) { + if (sb == null) { + sb = new StringBuilder(); + sb.append(s, 0, i); + } + appendEscape(sb, (byte)c); + } else { + if (sb != null) + sb.append(c); + } + } else if (allowNonASCII + && (Character.isSpaceChar(c) + || Character.isISOControl(c))) { + if (sb == null) { + sb = new StringBuilder(); + sb.append(s, 0, i); + } + appendEncoded(sb, c); + } else { + if (sb != null) + sb.append(c); + } + } + return (sb == null) ? 
s : sb.toString(); + } + + // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, + // assuming that s is otherwise legal + // + private static String encode(String s) { + int n = s.length(); + if (n == 0) + return s; + + // First check whether we actually need to encode + for (int i = 0;;) { + if (s.charAt(i) >= '\u0080') + break; + if (++i >= n) + return s; + } + + String ns = Normalizer.normalize(s, Normalizer.Form.NFC); + ByteBuffer bb = null; + try { + bb = ThreadLocalCoders.encoderFor("UTF-8") + .encode(CharBuffer.wrap(ns)); + } catch (CharacterCodingException x) { + assert false; + } + + StringBuilder sb = new StringBuilder(); + while (bb.hasRemaining()) { + int b = bb.get() & 0xff; + if (b >= 0x80) + appendEscape(sb, (byte)b); + else + sb.append((char)b); + } + return sb.toString(); + } + + private static int decode(char c) { + if ((c >= '0') && (c <= '9')) + return c - '0'; + if ((c >= 'a') && (c <= 'f')) + return c - 'a' + 10; + if ((c >= 'A') && (c <= 'F')) + return c - 'A' + 10; + assert false; + return -1; + } + + private static byte decode(char c1, char c2) { + return (byte)( ((decode(c1) & 0xf) << 4) + | ((decode(c2) & 0xf) << 0)); + } + + // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes + // that escapes are well-formed syntactically, i.e., of the form %XX. If a + // sequence of escaped octets is not valid UTF-8 then the erroneous octets + // are replaced with '\uFFFD'. + // Exception: any "%" found between "[]" is left alone. 
// It is an IPv6 literal
// with a scope_id
//
private static String decode(String s) {
    return decode(s, true);
}

// This method was introduced as a generalization of URI.decode method
// to provide a fix for JDK-8037396
//
// Returns s itself when it is null, empty, or contains no '%'.  When
// ignorePercentInBrackets is true, a '%' that appears after a '[' and
// before the next ']' is copied through instead of being decoded.
private static String decode(String s, boolean ignorePercentInBrackets) {
    if (s == null)
        return s;
    int n = s.length();
    if (n == 0)
        return s;
    if (s.indexOf('%') < 0)
        return s;

    StringBuilder sb = new StringBuilder(n);
    ByteBuffer bb = ByteBuffer.allocate(n);
    CharBuffer cb = CharBuffer.allocate(n);
    // Malformed or unmappable octet sequences are replaced, not reported
    CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
        .onMalformedInput(CodingErrorAction.REPLACE)
        .onUnmappableCharacter(CodingErrorAction.REPLACE);

    // This is not horribly efficient, but it will do for now
    char c = s.charAt(0);
    boolean betweenBrackets = false;

    for (int i = 0; i < n;) {
        assert c == s.charAt(i);    // Loop invariant
        if (c == '[') {
            betweenBrackets = true;
        } else if (betweenBrackets && c == ']') {
            betweenBrackets = false;
        }
        if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) {
            // Ordinary char (or protected '%'): copy it and move on
            sb.append(c);
            if (++i >= n)
                break;
            c = s.charAt(i);
            continue;
        }
        // Collect the octets of a run of consecutive %XX escapes into bb
        bb.clear();
        int ui = i;
        for (;;) {
            assert (n - i >= 2);
            bb.put(decode(s.charAt(++i), s.charAt(++i)));
            if (++i >= n)
                break;
            c = s.charAt(i);
            if (c != '%')
                break;
        }
        // Decode the collected octets as one UTF-8 sequence
        bb.flip();
        cb.clear();
        dec.reset();
        CoderResult cr = dec.decode(bb, cb, true);
        assert cr.isUnderflow();
        cr = dec.flush(cb);
        assert cr.isUnderflow();
        sb.append(cb.flip().toString());
    }

    return sb.toString();
}


// -- Parsing --

// For convenience we wrap the input URI string in a new instance of the
// following internal class.  This saves always having to pass the input
// string as an argument to each internal scan/parse method.
private class Parser {

    private final String input;     // URI input string
    private boolean requireServerAuthority = false;

    // Remembers the input and also stores it in the enclosing instance's
    // "string" field.
    Parser(String s) {
        input = s;
        string = s;
    }

    // -- Methods for throwing URISyntaxException in various ways --

    private void fail(String reason) throws URISyntaxException {
        throw new URISyntaxException(input, reason);
    }

    private void fail(String reason, int p) throws URISyntaxException {
        throw new URISyntaxException(input, reason, p);
    }

    private void failExpecting(String expected, int p)
        throws URISyntaxException
    {
        fail("Expected " + expected, p);
    }


    // -- Simple access to the input string --

    // Tells whether start < end and, if so, whether charAt(start) == c
    //
    private boolean at(int start, int end, char c) {
        return (start < end) && (input.charAt(start) == c);
    }

    // Tells whether start + s.length() <= end and, if so,
    // whether the chars at the start position match s exactly
    //
    private boolean at(int start, int end, String s) {
        return (s.length() <= end - start) && input.startsWith(s, start);
    }


    // -- Scanning --

    // The various scan and parse methods that follow use a uniform
    // convention of taking the current start position and end index as
    // their first two arguments.  The start is inclusive while the end is
    // exclusive, just as in the String class, i.e., a start/end pair
    // denotes the half-open interval [start, end) of the input string.
    //
    // These methods never proceed past the end position.  They may return
    // -1 to indicate outright failure, but more often they simply return
    // the position of the first char after the last char scanned.  Thus
    // a typical idiom is
    //
    //     int p = start;
    //     int q = scan(p, end, ...);
    //     if (q > p)
    //         // We scanned something
    //         ...;
    //     else if (q == p)
    //         // We scanned nothing
    //         ...;
    //     else if (q == -1)
    //         // Something went wrong
    //         ...;


    // Scan a specific char: If the char at the given start position is
    // equal to c, return the index of the next char; otherwise, return the
    // start position.
    //
    private int scan(int start, int end, char c) {
        if ((start < end) && (input.charAt(start) == c))
            return start + 1;
        return start;
    }

    // Scan forward from the given start position.  Stop at the first char
    // in the err string (in which case -1 is returned), or the first char
    // in the stop string (in which case the index of that char is
    // returned), or the end position (in which case end is returned).
    // May return the start position if nothing matches.
    //
    private int scan(int start, int end, String err, String stop) {
        int p = start;
        while (p < end) {
            char c = input.charAt(p);
            if (err.indexOf(c) >= 0)
                return -1;
            if (stop.indexOf(c) >= 0)
                break;
            p++;
        }
        return p;
    }

    // Scan forward from the given start position.  Stop at the first char
    // in the stop string (in which case the index of that char is
    // returned), or the end position (in which case end is returned).
    // May return the start position if nothing matches.
    //
    private int scan(int start, int end, String stop) {
        int p = start;
        while (p < end) {
            char c = input.charAt(p);
            if (stop.indexOf(c) >= 0)
                break;
            p++;
        }
        return p;
    }

    // Scan a potential escape sequence, starting at the given position,
    // with the given first char (i.e., charAt(start) == c).
    //
    // This method assumes that if escapes are allowed then visible
    // non-US-ASCII chars are also allowed.
    //
    // Returns start + 3 for a well-formed %XX escape, start + 1 for an
    // accepted non-US-ASCII char, start if nothing was scanned; throws
    // if a '%' is not followed by two hex digits.
    //
    private int scanEscape(int start, int n, char first)
        throws URISyntaxException
    {
        int p = start;
        char c = first;
        if (c == '%') {
            // Process escape pair
            if ((p + 3 <= n)
                && match(input.charAt(p + 1), L_HEX, H_HEX)
                && match(input.charAt(p + 2), L_HEX, H_HEX)) {
                return p + 3;
            }
            fail("Malformed escape pair", p);
        } else if ((c > 128)
                   && !Character.isSpaceChar(c)
                   && !Character.isISOControl(c)) {
            // Allow unescaped but visible non-US-ASCII chars
            return p + 1;
        }
        return p;
    }

    // Scan chars that match the given mask pair.  When the low mask has
    // the L_ESCAPED bit set, escapes (and whatever scanEscape accepts)
    // are scanned as well.
    //
    private int scan(int start, int n, long lowMask, long highMask)
        throws URISyntaxException
    {
        int p = start;
        while (p < n) {
            char c = input.charAt(p);
            if (match(c, lowMask, highMask)) {
                p++;
                continue;
            }
            if ((lowMask & L_ESCAPED) != 0) {
                int q = scanEscape(p, n, c);
                if (q > p) {
                    p = q;
                    continue;
                }
            }
            break;
        }
        return p;
    }

    // Check that each of the chars in [start, end) matches the given mask
    //
    private void checkChars(int start, int end,
                            long lowMask, long highMask,
                            String what)
        throws URISyntaxException
    {
        int p = scan(start, end, lowMask, highMask);
        if (p < end)
            fail("Illegal character in " + what, p);
    }

    // Check that the char at position p matches the given mask
    //
    private void checkChar(int p,
                           long lowMask, long highMask,
                           String what)
        throws URISyntaxException
    {
        checkChars(p, p + 1, lowMask, highMask, what);
    }


    // -- Parsing --

    // [<scheme>:]<scheme-specific-part>[#<fragment>]
    //
    // Entry point: parses the whole input and stores the components in
    // the enclosing instance.  If rsa is true, an authority that fails
    // to parse as a server is reported immediately instead of being
    // retried as a registry-based authority.
    //
    void parse(boolean rsa) throws URISyntaxException {
        requireServerAuthority = rsa;
        int n = input.length();
        int p = scan(0, n, "/?#", ":");
        if ((p >= 0) && at(p, n, ':')) {
            if (p == 0)
                failExpecting("scheme name", 0);
            checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
            checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
            scheme = input.substring(0, p);
            p++;                    // Skip ':'
            if (at(p, n, '/')) {
                p = parseHierarchical(p, n);
            } else {
                // opaque; need to create the schemeSpecificPart
                int q = scan(p, n, "#");
                if (q <= p)
                    failExpecting("scheme-specific part", p);
                checkChars(p, q, L_URIC, H_URIC, "opaque part");
                schemeSpecificPart = input.substring(p, q);
                p = q;
            }
        } else {
            p = parseHierarchical(0, n);
        }
        if (at(p, n, '#')) {
            checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
            fragment = input.substring(p + 1, n);
            p = n;
        }
        if (p < n)
            fail("end of URI", p);
    }

    // [//authority]<path>[?<query>]
    //
    // DEVIATION from RFC2396: We allow an empty authority component as
    // long as it's followed by a non-empty path, query component, or
    // fragment component.  This is so that URIs such as "file:///foo/bar"
    // will parse.  This seems to be the intent of RFC2396, though the
    // grammar does not permit it.  If the authority is empty then the
    // userInfo, host, and port components are undefined.
    //
    // DEVIATION from RFC2396: We allow empty relative paths.  This seems
    // to be the intent of RFC2396, but the grammar does not permit it.
    // The primary consequence of this deviation is that "#f" parses as a
    // relative URI with an empty path.
    //
    private int parseHierarchical(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        if (at(p, n, '/') && at(p + 1, n, '/')) {
            p += 2;
            int q = scan(p, n, "/?#");
            if (q > p) {
                p = parseAuthority(p, q);
            } else if (q < n) {
                // DEVIATION: Allow empty authority prior to non-empty
                // path, query component or fragment identifier
            } else
                failExpecting("authority", p);
        }
        int q = scan(p, n, "?#"); // DEVIATION: May be empty
        checkChars(p, q, L_PATH, H_PATH, "path");
        path = input.substring(p, q);
        p = q;
        if (at(p, n, '?')) {
            p++;
            q = scan(p, n, "#");
            checkChars(p, q, L_URIC, H_URIC, "query");
            query = input.substring(p, q);
            p = q;
        }
        return p;
    }

    // authority     = server | reg_name
    //
    // Ambiguity: An authority that is a registry name rather than a server
    // might have a prefix that parses as a server.  We use the fact that
    // the authority component is always followed by '/' or the end of the
    // input string to resolve this: If the complete authority did not
    // parse as a server then we try to parse it as a registry name.
    //
    private int parseAuthority(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        int q = p;
        URISyntaxException ex = null;

        boolean serverChars;
        boolean regChars;

        // NOTE(review): scan(p, n, "]") also returns n > p when no ']'
        // is present at all, so the percent-permitting mask is used for
        // any authority that does not begin with ']'.
        if (scan(p, n, "]") > p) {
            // contains a literal IPv6 address, therefore % is allowed
            serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
        } else {
            serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
        }
        regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);

        if (regChars && !serverChars) {
            // Must be a registry-based authority
            authority = input.substring(p, n);
            return n;
        }

        if (serverChars) {
            // Might be (probably is) a server-based authority, so attempt
            // to parse it as such.  If the attempt fails, try to treat it
            // as a registry-based authority.
            try {
                q = parseServer(p, n);
                if (q < n)
                    failExpecting("end of authority", q);
                authority = input.substring(p, n);
            } catch (URISyntaxException x) {
                // Undo results of failed parse
                userInfo = null;
                host = null;
                port = -1;
                if (requireServerAuthority) {
                    // If we're insisting upon a server-based authority,
                    // then just re-throw the exception
                    throw x;
                } else {
                    // Save the exception in case it doesn't parse as a
                    // registry either
                    ex = x;
                    q = p;
                }
            }
        }

        if (q < n) {
            if (regChars) {
                // Registry-based authority
                authority = input.substring(p, n);
            } else if (ex != null) {
                // Re-throw exception; it was probably due to
                // a malformed IPv6 address
                throw ex;
            } else {
                fail("Illegal character in authority", q);
            }
        }

        return n;
    }


    // [<userinfo>@]<host>[:<port>]
    //
    private int parseServer(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        int q;

        // userinfo
        q = scan(p, n, "/?#", "@");
        if ((q >= p) && at(q, n, '@')) {
            checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
            userInfo = input.substring(p, q);
            p = q + 1;              // Skip '@'
        }

        // hostname, IPv4 address, or IPv6 address
        if (at(p, n, '[')) {
            // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
            p++;
            q = scan(p, n, "/?#", "]");
            if ((q > p) && at(q, n, ']')) {
                // look for a "%" scope id
                int r = scan (p, q, "%");
                if (r > p) {
                    parseIPv6Reference(p, r);
                    if (r+1 == q) {
                        fail ("scope id expected");
                    }
                    checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID,
                                            "scope id");
                } else {
                    parseIPv6Reference(p, q);
                }
                // host keeps the enclosing brackets
                host = input.substring(p-1, q+1);
                p = q + 1;
            } else {
                failExpecting("closing bracket for IPv6 address", q);
            }
        } else {
            q = parseIPv4Address(p, n);
            if (q <= p)
                q = parseHostname(p, n);
            p = q;
        }

        // port
        if (at(p, n, ':')) {
            p++;
            q = scan(p, n, "/");
            if (q > p) {
                checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
                try {
                    port = Integer.parseInt(input, p, q, 10);
                } catch (NumberFormatException x) {
                    fail("Malformed port number", p);
                }
                p = q;
            }
        }
        if (p < n)
            failExpecting("port number", p);

        return p;
    }

    // Scan a string of decimal digits whose value fits in a byte.
    // Returns the position after the digits, or the start position if
    // there are no digits or their value exceeds 255.
    //
    private int scanByte(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        int q = scan(p, n, L_DIGIT, H_DIGIT);
        if (q <= p) return q;
        try {
            if (Integer.parseInt(input, p, q, 10) > 255) return p;
        } catch (NumberFormatException ignored) {
            // The digit run does not even fit in an int (presumably the
            // only way parseInt can fail on pure digits), so it cannot
            // fit in a byte.  Report "nothing scanned" rather than letting
            // an unchecked exception escape through takeIPv4Address.
            return p;
        }
        return q;
    }

    // Scan an IPv4 address.
    //
    // If the strict argument is true then we require that the given
    // interval contain nothing besides an IPv4 address; if it is false
    // then we only require that it start with an IPv4 address.
    //
    // If the interval does not contain or start with (depending upon the
    // strict argument) legal IPv4 address characters then we return -1
    // immediately; otherwise we insist that these characters parse as a
    // legal IPv4 address and throw an exception on failure.
    //
    // We assume that any string of decimal digits and dots must be an IPv4
    // address.  It won't parse as a hostname anyway, so making that
    // assumption here allows more meaningful exceptions to be thrown.
    //
    private int scanIPv4Address(int start, int n, boolean strict)
        throws URISyntaxException
    {
        int p = start;
        int q;
        int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
        if ((m <= p) || (strict && (m != n)))
            return -1;
        for (;;) {
            // Per RFC2732: At most three digits per byte
            // Further constraint: Each element fits in a byte
            if ((q = scanByte(p, m)) <= p) break;   p = q;
            if ((q = scan(p, m, '.')) <= p) break;  p = q;
            if ((q = scanByte(p, m)) <= p) break;   p = q;
            if ((q = scan(p, m, '.')) <= p) break;  p = q;
            if ((q = scanByte(p, m)) <= p) break;   p = q;
            if ((q = scan(p, m, '.')) <= p) break;  p = q;
            if ((q = scanByte(p, m)) <= p) break;   p = q;
            if (q < m) break;
            return q;
        }
        fail("Malformed IPv4 address", q);
        return -1;
    }

    // Take an IPv4 address: Throw an exception if the given interval
    // contains anything except an IPv4 address
    //
    private int takeIPv4Address(int start, int n, String expected)
        throws URISyntaxException
    {
        int p = scanIPv4Address(start, n, true);
        if (p <= start)
            failExpecting(expected, start);
        return p;
    }

    // Attempt to parse an IPv4 address, returning -1 on failure but
    // allowing the given interval to contain [:<port>] after
    // the IPv4 address.  On success the host field is set.
    //
    private int parseIPv4Address(int start, int n) {
        int p;

        try {
            p = scanIPv4Address(start, n, false);
        } catch (URISyntaxException x) {
            // Not an IPv4 address; the caller falls back to a hostname
            return -1;
        }

        if (p > start && p < n) {
            // IPv4 address is followed by something - check that
            // it's a ":" as this is the only valid character to
            // follow an address.
            if (input.charAt(p) != ':') {
                p = -1;
            }
        }

        if (p > start)
            host = input.substring(start, p);

        return p;
    }

    // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
    // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
    // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
    //
    private int parseHostname(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        int q;
        int l = -1;                 // Start of last parsed label

        do {
            // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
            q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
            if (q <= p)
                break;
            l = p;
            p = q;
            q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
            if (q > p) {
                // A label must not end with '-'
                if (input.charAt(q - 1) == '-')
                    fail("Illegal character in hostname", q - 1);
                p = q;
            }
            q = scan(p, n, '.');
            if (q <= p)
                break;
            p = q;
        } while (p < n);

        if ((p < n) && !at(p, n, ':'))
            fail("Illegal character in hostname", p);

        if (l < 0)
            failExpecting("hostname", start);

        // for a fully qualified hostname check that the rightmost
        // label starts with an alpha character.
        if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) {
            fail("Illegal character in hostname", l);
        }

        host = input.substring(start, p);
        return p;
    }


    // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
    //
    // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
    // the form ::12.34.56.78, which are clearly shown in the examples
    // earlier in the document.  Here is the original grammar:
    //
    //   IPv6address = hexpart [ ":" IPv4address ]
    //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
    //   hexseq      = hex4 *( ":" hex4)
    //   hex4        = 1*4HEXDIG
    //
    // We therefore use the following revised grammar:
    //
    //   IPv6address = hexseq [ ":" IPv4address ]
    //                 | hexseq [ "::" [ hexpost ] ]
    //                 | "::" [ hexpost ]
    //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
    //   hexseq      = hex4 *( ":" hex4)
    //   hex4        = 1*4HEXDIG
    //
    // This covers all and only the following cases:
    //
    //   hexseq
    //   hexseq : IPv4address
    //   hexseq ::
    //   hexseq :: hexseq
    //   hexseq :: hexseq : IPv4address
    //   hexseq :: IPv4address
    //   :: hexseq
    //   :: hexseq : IPv4address
    //   :: IPv4address
    //   ::
    //
    // Additionally we constrain the IPv6 address as follows :-
    //
    //  i.  IPv6 addresses without compressed zeros should contain
    //      exactly 16 bytes.
    //
    //  ii. IPv6 addresses with compressed zeros should contain
    //      less than 16 bytes.

    // Running total of address bytes seen so far: +2 per hex group,
    // +4 per embedded IPv4 address.
    private int ipv6byteCount = 0;

    private int parseIPv6Reference(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        int q;
        boolean compressedZeros = false;

        q = scanHexSeq(p, n);

        if (q > p) {
            p = q;
            if (at(p, n, "::")) {
                compressedZeros = true;
                p = scanHexPost(p + 2, n);
            } else if (at(p, n, ':')) {
                p = takeIPv4Address(p + 1,  n, "IPv4 address");
                ipv6byteCount += 4;
            }
        } else if (at(p, n, "::")) {
            compressedZeros = true;
            p = scanHexPost(p + 2, n);
        }
        if (p < n)
            fail("Malformed IPv6 address", start);
        if (ipv6byteCount > 16)
            fail("IPv6 address too long", start);
        if (!compressedZeros && ipv6byteCount < 16)
            fail("IPv6 address too short", start);
        if (compressedZeros && ipv6byteCount == 16)
            fail("Malformed IPv6 address", start);

        return p;
    }

    // Scan the optional part that may follow "::" (hexpost in the grammar
    // above); an empty interval is accepted.
    //
    private int scanHexPost(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        int q;

        if (p == n)
            return p;

        q = scanHexSeq(p, n);
        if (q > p) {
            p = q;
            if (at(p, n, ':')) {
                p++;
                p = takeIPv4Address(p, n, "hex digits or IPv4 address");
                ipv6byteCount += 4;
            }
        } else {
            p = takeIPv4Address(p, n, "hex digits or IPv4 address");
            ipv6byteCount += 4;
        }
        return p;
    }

    // Scan a hex sequence; return -1 if one could not be scanned
    //
    private int scanHexSeq(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        int q;

        q = scan(p, n, L_HEX, H_HEX);
        if (q <= p)
            return -1;
        if (at(q, n, '.'))          // Beginning of IPv4 address
            return -1;
        if (q > p + 4)
            fail("IPv6 hexadecimal digit sequence too long", p);
        ipv6byteCount += 2;
        p = q;
        while (p < n) {
            if (!at(p, n, ':'))
                break;
            if (at(p + 1, n, ':'))
                break;              // "::"
            p++;
            q = scan(p, n, L_HEX, H_HEX);
            if (q <= p)
                failExpecting("digits for an IPv6 address", p);
            if (at(q, n, '.')) {    // Beginning of IPv4 address
                // Step back onto the ':' so the caller sees ": IPv4address"
                p--;
                break;
            }
            if (q > p + 4)
                fail("IPv6 hexadecimal digit sequence too long", p);
            ipv6byteCount += 2;
            p = q;
        }

        return p;
    }

}
}
diff --git a/net-resource/src/test/java/org/xbib/net/resource/BuilderTest.java b/net-resource/src/test/java/org/xbib/net/resource/BuilderTest.java new file mode 100644 index 0000000..3bfb4ee --- /dev/null +++ b/net-resource/src/test/java/org/xbib/net/resource/BuilderTest.java @@ -0,0 +1,1074 @@ +package org.xbib.net.resource; +/* + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions.
+ */ + +/** + * @test + */ + +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Stream; + +public class BuilderTest { + + static final boolean DEBUG = false; // additional verbose info if true + + // The component constants are defined in the main class rather than + // in the component class for conciseness purposes. + public static final Component SCHEME = + new Component<>("scheme", IRI::getScheme, IRI::getScheme, IRI.Builder::scheme); + public static final Component AUTHORITY = + new Component<>("authority", IRI::getRawAuthority, IRI::getAuthority, IRI.Builder::authority, + null, "host", "user", "port"); + public static final Component USER = + new Component<>("user", IRI::getRawUserInfo, IRI::getUserInfo, IRI.Builder::userinfo, + null, "authority"); + public static final Component HOST = + new Component<>("host", IRI::getRawHostString, IRI::getHostString, IRI.Builder::host, + null, "authority"); + public static final Component PORT = + new Component("port", IRI::getPort, IRI::getPort, IRI.Builder::port, + -1, "authority"); + public static final Component PATH = + new Component<>("path", IRI::getRawPath, IRI::getPath, IRI.Builder::path, + null, "opaque"); + // Opaque is defined as having path as an alias, because if opaque was supplied then + // IRI::getPath will return it. We also have special getter so that OPAQUE::get will + // return null if the IRI is not opaque. 
+ public static final Component OPAQUE = + new Component<>("opaque", PATH, + Component::rawOpaque, Component::opaque, IRI.Builder::opaque, + null, "authority", "host", "user", "port", "path"); + public static final Component QUERY = + new Component<>("query", IRI::getRawQuery, IRI::getQuery, IRI.Builder::query); + public static final Component FRAGMENT = + new Component<>("fragment", IRI::getRawFragment, IRI::getFragment, IRI.Builder::fragment); + + public static Test test(long id) { return new Test(id); } + public static Test test(long id, String iri) { + return new Test(id, () -> IRI.of(iri).with(IRI.Builder.DEFAULT_CAPABILITY)); + } + public static Test quoting(long id) { return new Test(id, BuilderTest::quotingBuilder); } + public static Test quoting(long id, String iri) { + return new Test(id, () -> IRI.of(iri).with(IRI.Builder.QUOTE_ENCODED_CAPABILITY)); + } + public static IRI.Builder quotingBuilder() { + return IRI.newBuilder(IRI.Builder.QUOTE_ENCODED_CAPABILITY); + } + + static final Test[] positive = { + test(0).set(SCHEME, "s").set(HOST, "h").set(PATH, "/p") + .check(AUTHORITY, "h") + .expect("s://h/p"), + test(1).set(SCHEME, "s").set(HOST, "h").set(PATH, "/p") + .set(QUERY, "").set(FRAGMENT, "") + .check(AUTHORITY, "h") + .expect("s://h/p?#"), + test(2).set(SCHEME, "s").set(HOST, "h").set(USER, "u").set(PORT, 42).set(PATH, "/p") + .set(QUERY, "q").set(FRAGMENT, "f") + .check(AUTHORITY, "u@h:42") + .expect("s://u@h:42/p?q#f"), + test(3).set(SCHEME, "s").set(HOST, "h").set(USER, "u").set(PORT, 42).set(PATH, "/p") + .check(AUTHORITY, "u@h:42") + .expect("s://u@h:42/p"), + test(4).set(SCHEME, "s").set(HOST, "h").set(PATH, "/p") + .set(AUTHORITY, "ha") // erase host, user, port + .check(HOST, "ha") + .expect("s://ha/p"), // authority has 'replaced' host + test(5).set(SCHEME, "s").set(HOST, "h").set(PATH, "/p") + .set(USER, "u").set(PORT, 0) + .set(AUTHORITY, "ha") // erase host, user, port + .check(HOST, "ha") // authority has 'replaced' host + 
.expect("s://ha/p"), + test(6).set(SCHEME, "s").set(HOST, "h").set(PATH, "/p") + .set(AUTHORITY, "u@ha:0") // erase host, user, port + .check(HOST, "ha") // authority has 'replaced' host + .check(PORT, 0) // authority has 'replaced' port + .check(USER, "u") // authority has 'replaced' user + .expect("s://u@ha:0/p"), + test(7).set(HOST, "h").set(SCHEME, "s").set(PATH, "/p").set(AUTHORITY, "u@ha:0") + .set(OPAQUE, "blah") // erase host, port, user, authority, path + .expect("s:blah"), + test(8).set(HOST, "h").set(SCHEME, "s").set(PATH, "/p").set(AUTHORITY, "u@ha:0") + .set(OPAQUE, "blah") // erase host, port, user, authority, path + .check(PATH, "blah") // needless: should already be tested... + .expect("s:blah"), + test(9).set(SCHEME, "s").set(HOST, "h").set(USER, "I").set(PORT, 42).set(PATH, "/p") + .set(AUTHORITY, "u@ha:0") // erase host, user, port + .check(HOST, "ha") // authority has 'replaced' host + .check(PORT, 0) // authority has 'replaced' port + .check(USER, "u") // authority has 'replaced' user + .expect("s://u@ha:0/p"), + test(10).set(SCHEME, "s1").set(HOST, "h").set(USER, "I").set(PORT, 42).set(PATH, "/p") + .set(SCHEME, "s").set(HOST, "h1") + .check(AUTHORITY, "I@h1:42") + .expect("s://I@h1:42/p"), + test(11).set(SCHEME, "s1").set(HOST, "h").set(USER, "I").set(PORT, 42).set(PATH, "/p") + .set(SCHEME, "s").set(HOST, "h1").set(PORT, -1) + .check(AUTHORITY, "I@h1") + .expect("s://I@h1/p"), + test(12).set(SCHEME, "s1").set(HOST, "h").set(USER, "I").set(PORT, 42).set(PATH, "/p") + .set(QUERY, null) + .set(SCHEME, "s").set(HOST, "h1").set(PORT, -1) + .check(AUTHORITY, "I@h1") + .expect("s://I@h1/p"), + test(13).set(SCHEME, "s1").set(HOST, "h").set(USER, "I").set(PORT, 42).set(PATH, "/p") + .set(QUERY, "q") + .set(SCHEME, "s").set(HOST, "h1").set(PORT, -1) + .check(AUTHORITY, "I@h1") + .expect("s://I@h1/p?q"), + test(14).set(SCHEME, "s1").set(HOST, "h").set(USER, "I").set(PORT, 42).set(PATH, "/p") + .set(QUERY, "q") + .set(SCHEME, "s").set(HOST, 
"h1").set(PORT, -1) + .set(QUERY, null).set(FRAGMENT, "f") + .check(AUTHORITY, "I@h1") + .expect("s://I@h1/p#f"), + test(15).set(SCHEME, "s1").set(HOST, "h").set(USER, "I").set(PORT, 42).set(PATH, "/p") + .set(QUERY, "q").set(FRAGMENT, "f") + .set(SCHEME, "s").set(HOST, "h1").set(PORT, -1) + .set(QUERY, null).set(FRAGMENT, null) + .check(AUTHORITY, "I@h1") + .expect("s://I@h1/p"), + test(16).set(SCHEME, "s").set(HOST, "h").set(USER, "u").set(PORT, 42).set(PATH, "/p") + .set(QUERY, "q").set(FRAGMENT, "f") + .set(SCHEME, null).set(HOST, null).set(USER, null).set(PORT, -1).set(PATH, null) + .set(QUERY, null).set(FRAGMENT, null) + .check(PATH, "") + .expect(""), + test(17).set(SCHEME, "s").set(QUERY, "q").set(FRAGMENT, "f") + .set(OPAQUE, "blah") + .expect("s:blah?q#f"), + test(18).set(SCHEME, "s").set(QUERY, "q").set(FRAGMENT, "f") + .set(OPAQUE, "blah?r") + .raw(OPAQUE, "blah%3Fr") + .expect("s:blah%3Fr?q#f") + .ti("s:blah%3Fr?q#f"), + + // encoded + test(19).set(SCHEME, "s").set(HOST, "h:%3a%42").set(USER, "u@%41").set(PORT, 42) + .set(PATH, "/p%2f%42").set(QUERY, "q[%40]q").set(FRAGMENT, "f[%42]f") + .raw(USER, "u%40%41").check(USER, "u@A") + .raw(HOST, "h%3A%3a%42").check(HOST, "h::B") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%40%41@h%3A%3a%42:42") + .check(AUTHORITY, "u%40A@h%3A%3AB:42") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df"), + + // encoded: edge case in host name: %25%34%31 => %41 => A + // we check that getHost() will have %41 for the sequence %25%34%31, + // but authority and toIRIString should have %2541 - that is the % character + // will remain encoded there to avoid the %41 sequence to be further + // decoded into A at the next iteration - the preserving the toIRIString + // invariant. 
+ test(20).set(SCHEME, "s").set(HOST, "h:%3a%25%34%31%42").set(USER, "u@%41").set(PORT, 42) + .set(PATH, "/p%2f%42").set(QUERY, "q[%40]q").set(FRAGMENT, "f[%42]f") + .raw(USER, "u%40%41").check(USER, "u@A") + .raw(HOST, "h%3A%3a%25%34%31%42").check(HOST, "h::%41B") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%40%41@h%3A%3a%25%34%31%42:42") + .check(AUTHORITY, "u%40A@h%3A%3A%2541B:42") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%40%41@h%3A%3a%25%34%31%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .ti("s://u%40A@h%3A%3A%2541B:42/p%2FB?q%5B@%5Dq#f%5BB%5Df"), + + // check invalid sequences in the middle of valid sequences. + // check that %25%34%31 translate into %2541 in the IRI string, not as %41 + test(21, "http://foo.com/with%25%34%31%C0%AF%25%34%310xC0AF%EC%82%AF\uc0af") + .check(SCHEME, "http") + .check(HOST, "foo.com") + .check(AUTHORITY, "foo.com") + .check(PATH, "/with%41%C0%AF%410xC0AF\uc0af\uc0af") + .raw(PATH, "/with%25%34%31%C0%AF%25%34%310xC0AF%EC%82%AF\uc0af") + .ti("http://foo.com/with%2541%C0%AF%25410xC0AF\uc0af\uc0af"), + + // encoded with quoting + quoting(22).set(SCHEME, "s").set(HOST, "h:%3a%42").set(USER, "u@%41").set(PORT, 42) + .set(PATH, "/p%2f%42").set(QUERY, "q[%40]q").set(FRAGMENT, "f[%42]f") + .raw(USER, "u%40%2541").check(USER, "u@%41") + .raw(HOST, "h%3A%253a%2542").check(HOST, "h:%3a%42") + .raw(PATH, "/p%252f%2542").check(PATH, "/p%2f%42") + .raw(AUTHORITY, "u%40%2541@h%3A%253a%2542:42") + .check(AUTHORITY, "u%40%2541@h%3A%253a%2542:42") + .raw(QUERY, "q%5B%2540%5Dq").check(QUERY, "q[%40]q") + .raw(FRAGMENT, "f%5B%2542%5Df").check(FRAGMENT, "f[%42]f") + .expect("s://u%40%2541@h%3A%253a%2542:42/p%252f%2542?q%5B%2540%5Dq#f%5B%2542%5Df"), + + // build from IRI, don't change any value, check nothing changed. 
+ quoting(23,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .check(SCHEME, "s") + .check(PORT, 42) + .raw(USER, "u%40%41").check(USER, "u@A") + .raw(HOST, "h%3A%3a%42").check(HOST, "h::B") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%40%41@h%3A%3a%42:42") + .check(AUTHORITY, "u%40A@h%3A%3AB:42") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .ti("s://u%40A@h%3A%3AB:42/p%2FB?q%5B@%5Dq#f%5BB%5Df"), + + // Same as case 23, but change host + quoting(24,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .set(HOST, "h:%3a%42") // change host => host will be quoted + .check(PORT, 42) + .check(SCHEME, "s") + .raw(USER, "u%40%41").check(USER, "u@A") + .raw(HOST, "h%3A%253a%2542").check(HOST, "h:%3a%42") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%40%41@h%3A%253a%2542:42") + .check(AUTHORITY, "u%40A@h%3A%253a%2542:42") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%40%41@h%3A%253a%2542:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .ti("s://u%40A@h%3A%253a%2542:42/p%2FB?q%5B@%5Dq#f%5BB%5Df"), + + // Same as case 23, but change user + quoting(25,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .set(USER, "u%40@%41") // change user => user will be quoted + .check(SCHEME, "s") + .check(PORT, 42) + .raw(USER, "u%2540%40%2541").check(USER, "u%40@%41") + .raw(HOST, "h%3A%3a%42").check(HOST, "h::B") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%2540%40%2541@h%3A%3a%42:42") + .check(AUTHORITY, "u%2540%40%2541@h%3A%3AB:42") // TODO: is that OK? 
+ .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%2540%40%2541@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .ti("s://u%2540%40%2541@h%3A%3AB:42/p%2FB?q%5B@%5Dq#f%5BB%5Df"), + + // Same as case 23, but change path + quoting(26,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .set(PATH, "/p%2f%42") // change path => path will be quoted + .check(SCHEME, "s") + .check(PORT, 42) + .raw(USER, "u%40%41").check(USER, "u@A") + .raw(HOST, "h%3A%3a%42").check(HOST, "h::B") + .raw(PATH, "/p%252f%2542").check(PATH, "/p%2f%42") + .raw(AUTHORITY, "u%40%41@h%3A%3a%42:42") + .check(AUTHORITY, "u%40A@h%3A%3AB:42") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%40%41@h%3A%3a%42:42/p%252f%2542?q%5B%40%5Dq#f%5B%42%5Df") + .ti("s://u%40A@h%3A%3AB:42/p%252f%2542?q%5B@%5Dq#f%5BB%5Df"), + + // Same as case 23, but change query + quoting(27,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .set(QUERY, "q%5B[%40@%41]%5Dq") // change query => query will be quoted + .check(SCHEME, "s") + .check(PORT, 42) + .raw(USER, "u%40%41").check(USER, "u@A") + .raw(HOST, "h%3A%3a%42").check(HOST, "h::B") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%40%41@h%3A%3a%42:42") + .check(AUTHORITY, "u%40A@h%3A%3AB:42") + .raw(QUERY, "q%255B%5B%2540@%2541%5D%255Dq") + .check(QUERY, "q%5B[%40@%41]%5Dq") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%40%41@h%3A%3a%42:42/p%2f%42?q%255B%5B%2540@%2541%5D%255Dq#f%5B%42%5Df") + .ti("s://u%40A@h%3A%3AB:42/p%2FB?q%255B%5B%2540@%2541%5D%255Dq#f%5BB%5Df"), + + // Same as case 23, but change fragment + quoting(28,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .set(FRAGMENT, "f%5B[%2541%41]%5Df") // change fragment => fragment will be quoted + .check(SCHEME, "s") + .check(PORT, 42) + .raw(USER, "u%40%41").check(USER, "u@A") + 
.raw(HOST, "h%3A%3a%42").check(HOST, "h::B") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%40%41@h%3A%3a%42:42") + .check(AUTHORITY, "u%40A@h%3A%3AB:42") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%255B%5B%252541%2541%5D%255Df") + .check(FRAGMENT, "f%5B[%2541%41]%5Df") + .expect("s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%255B%5B%252541%2541%5D%255Df") + .ti("s://u%40A@h%3A%3AB:42/p%2FB?q%5B@%5Dq#f%255B%5B%252541%2541%5D%255Df"), + + // Same as case 23, but change authority + quoting(29,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .set(AUTHORITY, "u%40%41@h%3A%3a%42:43") // change authority => authority will be quoted + .check(PORT, 43) + .check(SCHEME, "s") + .raw(USER, "u%2540%2541").check(USER, "u%40%41") + .raw(HOST, "h%253A%253a%2542").check(HOST, "h%3A%3a%42") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%2540%2541@h%253A%253a%2542:43") + .check(AUTHORITY, "u%2540%2541@h%253A%253a%2542:43") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%2540%2541@h%253A%253a%2542:43/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .ti("s://u%2540%2541@h%253A%253a%2542:43/p%2FB?q%5B@%5Dq#f%5BB%5Df"), + + // Same as case 23, but change scheme + quoting(30,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .set(SCHEME, "s2") // change scheme + .check(SCHEME, "s2") + .check(PORT, 42) + .raw(USER, "u%40%41").check(USER, "u@A") + .raw(HOST, "h%3A%3a%42").check(HOST, "h::B") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%40%41@h%3A%3a%42:42") + .check(AUTHORITY, "u%40A@h%3A%3AB:42") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s2://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .ti("s2://u%40A@h%3A%3AB:42/p%2FB?q%5B@%5Dq#f%5BB%5Df"), + + // Same as case 23, but change port + 
quoting(31,"s://u%40%41@h%3A%3a%42:42/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .set(PORT, 43) // change port + .check(SCHEME, "s") + .check(PORT, 43) + .raw(USER, "u%40%41").check(USER, "u@A") + .raw(HOST, "h%3A%3a%42").check(HOST, "h::B") + .raw(PATH, "/p%2f%42").check(PATH, "/p%2FB") + .raw(AUTHORITY, "u%40%41@h%3A%3a%42:43") + .check(AUTHORITY, "u%40A@h%3A%3AB:43") + .raw(QUERY, "q%5B%40%5Dq").check(QUERY, "q[@]q") + .raw(FRAGMENT, "f%5B%42%5Df").check(FRAGMENT, "f[B]f") + .expect("s://u%40%41@h%3A%3a%42:43/p%2f%42?q%5B%40%5Dq#f%5B%42%5Df") + .ti("s://u%40A@h%3A%3AB:43/p%2FB?q%5B@%5Dq#f%5BB%5Df"), + }; + + + static final Test[] negative = { + test(1).set(SCHEME, "a%b") + .expectFailure(BuilderTest::uriSyntaxException), + test(2).set(SCHEME, "a%b") + .expectUncheckedFailure(BuilderTest::asIllegalArgumentException), + test(3).set(OPAQUE, "opaque") + .expectFailure(BuilderTest::illegalArgumentException), + test(4).set(OPAQUE, "opaque") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(5).set(SCHEME, "s").set(PATH, "p") + .expectFailure(BuilderTest::illegalArgumentException), + test(6).set(SCHEME, "s").set(PATH, "p") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(7).set(SCHEME, "a%b") + .expectFailure(BuilderTest::uriSyntaxException), + quoting(8).set(SCHEME, "a%b") + .expectUncheckedFailure(BuilderTest::asIllegalArgumentException), + quoting(9).set(OPAQUE, "opaque") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(10).set(OPAQUE, "opaque") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(11).set(SCHEME, "s").set(PATH, "p") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(12).set(SCHEME, "s").set(PATH, "p") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + + // IAE in createHierarchical + test(13).set(USER, "u") + .expectFailure(BuilderTest::illegalArgumentException), + test(14).set(PORT, 0) + 
.expectFailure(BuilderTest::illegalArgumentException), + test(15).set(SCHEME, "s").set(USER, "u") + .expectFailure(BuilderTest::illegalArgumentException), + test(16).set(SCHEME, "s").set(PORT, 0) + .expectFailure(BuilderTest::illegalArgumentException), + test(17).set(PATH, "//p") + .expectFailure(BuilderTest::illegalArgumentException), + test(18).set(SCHEME, "s").set(PATH, "//p") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(19).set(USER, "u") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(20).set(PORT, 0) + .expectFailure(BuilderTest::illegalArgumentException), + quoting(21).set(SCHEME, "s").set(USER, "u") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(22).set(SCHEME, "s").set(PORT, 0) + .expectFailure(BuilderTest::illegalArgumentException), + quoting(23).set(PATH, "//p") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(24).set(SCHEME, "s").set(PATH, "//p") + .expectFailure(BuilderTest::illegalArgumentException), + test(25).set(USER, "u") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(26).set(PORT, 0) + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(27).set(SCHEME, "s").set(USER, "u") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(28).set(SCHEME, "s").set(PORT, 0) + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(29).set(PATH, "//p") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(30).set(SCHEME, "s").set(PATH, "//p") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(31).set(USER, "u") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(32).set(PORT, 0) + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(33).set(SCHEME, "s").set(USER, "u") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(34).set(SCHEME, "s").set(PORT, 0) + 
.expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(35).set(PATH, "//p") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(36).set(SCHEME, "s").set(PATH, "//p") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + + // IAE trying to stuff scheme in path + test(37).set(PATH, "s://a/b/c/d") + .expectFailure(BuilderTest::illegalArgumentException), + test(38).set(PATH, "s://a/b/c/d") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(39).set(PATH, "s://a/b/c/d") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(40).set(PATH, "s://a/b/c/d") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + + // u@h:80/p + test(41).set(PATH, "u@h:80/p").set(HOST, "") + .expectFailure(BuilderTest::illegalArgumentException), + test(42).set(PATH, "u@h:80/p").set(HOST, "") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(43).set(PATH, "u@h:80/p").set(HOST, "") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(44).set(PATH, "u@h:80/p").set(HOST, "") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(45).set(PATH, "u@h:80/p").set(AUTHORITY, "") + .expectFailure(BuilderTest::illegalArgumentException), + test(46).set(PATH, "u@h:80/p").set(AUTHORITY, "") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(47).set(PATH, "u@h:80/p").set(AUTHORITY, "") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(48).set(PATH, "u@h:80/p").set(AUTHORITY, "") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(49).set(PATH, "u@h:80/p").set(HOST, "").set(SCHEME, "s") + .expectFailure(BuilderTest::illegalArgumentException), + test(50).set(PATH, "u@h:80/p").set(HOST, "").set(SCHEME, "s") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(51).set(PATH, "u@h:80/p").set(HOST, "").set(SCHEME, "s") + 
.expectFailure(BuilderTest::illegalArgumentException), + quoting(52).set(PATH, "u@h:80/p").set(HOST, "").set(SCHEME, "s") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + test(53).set(PATH, "u@h:80/p").set(AUTHORITY, "").set(SCHEME, "s") + .expectFailure(BuilderTest::illegalArgumentException), + test(54).set(PATH, "u@h:80/p").set(AUTHORITY, "").set(SCHEME, "s") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + quoting(55).set(PATH, "u@h:80/p").set(AUTHORITY, "").set(SCHEME, "s") + .expectFailure(BuilderTest::illegalArgumentException), + quoting(56).set(PATH, "u@h:80/p").set(AUTHORITY, "").set(SCHEME, "s") + .expectUncheckedFailure(BuilderTest::illegalArgumentException), + + }; + + + public static void positive() throws URISyntaxException { + for (Test test : positive) { + System.out.println("positive["+ test.id +"]: " + test.inputValues); + test.build().verify().store() + .buildUnchecked().verify().same(); + IRI iri = test.iri(); + test.expect(iri.toIRIString()).verify(); + } + } + + public static void negative() throws URISyntaxException { + for (Test test : negative) { + System.out.println("negative["+ test.id +"]: " + test.inputValues); + test.fail(); + } + } + + public static void main(String args[]) throws Exception { + identities(); + positive(); + negative(); + } + + public static void identities() throws Exception { + + URL classUrl = new URL("jrt:/java.base/java/lang/Object.class"); + + String[] uris = { + "mailto:xyz@abc.de", + "file:xyz#ab", + "http:abc/xyz/pqr", + "http:abc/xyz/pqr?id=x%0a&ca=true", + "file:/C:/v700/dev/unitTesting/tests/apiUtil/uri", + "http:///p", + "file:/C:/v700/dev/unitTesting/tests/apiUtil/uri", + "file:/C:/v700/dev%20src/unitTesting/tests/apiUtil/uri", + "file:/C:/v700/dev%20src/./unitTesting/./tests/apiUtil/uri", + "http://localhost:80/abc/./xyz/../pqr?id=x%0a&ca=true", + "file:./test/./x", + "file:./././%20#i=3", + "file:?hmm", + "file:.#hmm", + "foo", + "foo/bar", + "./foo/bar#there", + 
"http://foo.com/with%25%34%31%C0%AF%25%34%310xC0AF%EC%82%AF\uc0af", + classUrl.toExternalForm(), + + }; + for (String s : uris) { + System.out.println("identities: " + s); + IRI i1 = IRI.parseIRI(s); + IRI i2 = i1.with(0).build(); + IRI i3 = i1.with(0).buildUnchecked(); + if (!i1.equals(i2)) { + String msg = String.format("Identity failed for: %s\n\t built: %s", i1, i2); + System.out.println(msg); + throw new RuntimeException(msg); + } + if (!i1.equals(i3)) { + String msg = String.format("Identity failed for: %s\n\t built: %s", i1, i3); + System.out.println(msg); + throw new RuntimeException(msg); + } + if (!i1.toIRIString().equals(i2.toIRIString())) { + String msg = String.format("Identity failed for: %s\n\t built: %s", + i1.toIRIString(), i2.toIRIString()); + System.out.println(msg); + throw new RuntimeException(msg); + } + if (!i1.toIRIString().equals(i3.toIRIString())) { + String msg = String.format("Identity failed for: %s\n\t built: %s", + i1.toIRIString(), i3.toIRIString()); + System.out.println(msg); + throw new RuntimeException(msg); + } + } + + } + + // ================================================================================ // + + static RuntimeException uriSyntaxException(Throwable t) { + if (t instanceof URISyntaxException) return null; + return new RuntimeException("Expected URISyntaxException, got: " + t); + } + + static RuntimeException illegalArgumentException(Throwable t) { + if (t instanceof IllegalArgumentException) return null; + return new RuntimeException("Expected IllegalArgumentException, got: " + t); + } + + static RuntimeException asIllegalArgumentException(Throwable t) { + RuntimeException failed = illegalArgumentException(t); + if (failed != null) return failed; + return uriSyntaxException(t.getCause()); + } + + static final class Test { + final long id; + Throwable failed; + Supplier builderFactory; + IRI.Builder builder; + IRI iri; + IRI expected; + String expectedIRIString; + final List stores = new ArrayList<>(); + + // for 
negative tests + BuildMethod build; + String buildName; + Function checker; + + // The list of values to set on the builder. Order is important. + List> inputValues = new ArrayList<>(); + // The set of values to check on IRI. + Map, Value> valueMap = new HashMap<>(); + // The set of raw values to check on IRI. If no raw value is + // given, the raw value is assumed to be the same as the decoded + // value. + Map, RawValue> rawValueMap = new HashMap<>(); + + interface BuildMethod { + IRI build(IRI.Builder builder) throws URISyntaxException; + } + + Test() { + this(-1); + } + + Test(long id) { + this(id, IRI::newBuilder); + } + + Test(long id, Supplier builders) { + this.id = id; + builderFactory = builders; + } + + Test store() { + Test stored = new Test(id, builderFactory); + stored.inputValues = new ArrayList<>(inputValues); + stored.valueMap = new HashMap<>(valueMap); + stored.rawValueMap = new HashMap<>(rawValueMap); + stored.builder = builder; + stored.iri = iri; + stored.failed = failed; + stores.add(stored); + return this; + } + + Test set(Component component, V value) { + inputValues.add(component.input(value)); + valueMap.put(component, component.value(value)); + component.erasing().forEach(c -> valueMap.put(c, c.none())); + Component alias = component.alias(); + if (alias != null && !Objects.equals(value, component.none().get())) { + valueMap.put(alias, alias.value(value)); + } + return this; + } + + Test check(Component component, V value) { + valueMap.put(component, component.value(value)); + Component alias = component.alias(); + if (alias != null) valueMap.put(alias, alias.value(value)); + return this; + } + + Test raw(Component component, V value) { + rawValueMap.put(component, component.raw(value)); + Component alias = component.alias(); + if (alias != null) rawValueMap.put(alias, alias.raw(value)); + return this; + } + + Test builder(Supplier builder) { + this.builderFactory = builder; + return this; + } + + Test expectFailure(Function checker) { + 
return expectFailure("", IRI.Builder::build, checker); + } + + Test expectUncheckedFailure(Function checker) { + return expectFailure("", IRI.Builder::buildUnchecked, checker); + } + + Test expectFailure(String buildName, BuildMethod build, + Function checker) { + this.buildName = buildName; + this.build = build; + this.checker = checker; + return this; + } + + + Test fail() { + System.out.print("Building" + buildName + ": " + inputValues); + Throwable failed = null; + try { + this.builder = builderFactory.get(); + inputValues.stream().forEach(v -> v.set(builder)); + this.iri = build.build(builder); + } catch (Throwable t) { + failed = t; + } + try { + if (failed == null) { + System.out.print(" -> Should have failed: " + String.valueOf(iri)); + throw new RuntimeException("Test " + id + " " + inputValues + " should have failed!"); + } + RuntimeException checkFailed = checker.apply(failed); + if (checkFailed == null) { + System.out.print(" -> Got expected exception\n\t" + failed); + } else { + System.out.print(" -> Got unexpected exception\n\t" + failed); + throw checkFailed; + } + } finally { + System.out.println(); + } + + return this; + } + + Test build() throws URISyntaxException { + System.out.print("Building: " + inputValues); + try { + this.builder = builderFactory.get(); + inputValues.stream().forEach(v -> v.set(builder)); + this.iri = builder.build(); + } finally { + System.out.println(" -> " + (iri == null ? "Failed" : iri.toString())); + } + return this; + } + + Test buildUnchecked() { + System.out.print("Building (unchecked): " + inputValues); + try { + this.builder = builderFactory.get(); + inputValues.stream().forEach(v -> v.set(builder)); + this.iri = builder.buildUnchecked(); + } finally { + System.out.println(" -> " + (iri == null ? 
"Failed" : iri.toString())); + } + return this; + } + + Test verify() { + for (Component c : Component.COMPONENTS.values()) { + valueMap.putIfAbsent(c, c.none()); + rawValueMap.putIfAbsent(c, valueMap.get(c).raw()); + valueMap.get(c).check(iri); + rawValueMap.get(c).check(iri); + } + if (expected != null) { + if (!Objects.equals(iri(), expected)) { + String msg = String.format( + "IRIs differ from expected %s:\n\t expected %s\n\t actual %s", + inputValues, expected, iri()); + System.out.println(msg); + throw new RuntimeException(msg); + } + } + if (expectedIRIString != null) { + if (!Objects.equals(iri().toIRIString(), expectedIRIString)) { + String msg = String.format( + "IRI.toIRIString() differ from expected:\n\t expected %s\n\t actual %s", + expectedIRIString, iri().toIRIString()); + System.out.println(msg); + throw new RuntimeException(msg); + } + } + String toIRI; + if (!iri().toIRIString().equals(toIRI = IRI.of(iri().toIRIString()).toIRIString())) { + String msg = String.format( + "IRI.create(iri.toIRIString()).toIRIString() differ from expected:\n\t expected %s\n\t actual %s", + iri().toIRIString(), toIRI); + System.out.println(msg); + throw new RuntimeException(msg); + } + return this; + } + + Test same() { + stores.forEach(t -> { + if (!Objects.equals(iri, t.iri())) { + String msg = String.format("IRIs differ %s:\n\t %s\n\t %s", + inputValues, iri, t.iri()); + System.out.println(msg); + throw new RuntimeException(msg); + } + }); + return this; + } + + Test expect(String iri) { + return expect(IRI.of(iri)); + } + + Test expect(IRI iri) { + expected = iri; + return this; + } + + IRI iri() { + return iri; + } + + Test ti(String iriString) { + expectedIRIString = iriString; + return this; + } + } + + // A component value to be set on the Builder. 
+ static final class InputValue { + final T value; + final String name; + final BiFunction set; + + private InputValue(T value, ValueAccessor invoker) { + this.value = value; + this.name = invoker.name(); + this.set = invoker::set; + } + + private InputValue(ValueAccessor invoker) { + this.value = null; + this.name = invoker.name(); + this.set = null; + } + + public T get() { + return value; + } + + public boolean isEmpty() { + return set == null; + } + + public IRI.Builder set(IRI.Builder builder) { + if (set == null) return builder; + return set.apply(builder, value); + } + + @Override + public String toString() { + return String.format("%s: %s", name, value); + } + + public static InputValue of(T value, ValueAccessor invoker) { + return new InputValue(value, invoker); + } + + public static InputValue empty(ValueAccessor invoker) { + return new InputValue<>(invoker); + } + + } + + // A component value to be checked against the built + // IRI. May be a decoded value (Value) or a raw + // value (RawValue). 
+ abstract static class CheckValue { + final T value; + final Function get; + final ValueAccessor invoker; + + CheckValue(ValueAccessor invoker, T value, Function get) { + this.invoker = Objects.requireNonNull(invoker); + this.value = value; + this.get = Objects.requireNonNull(get); + } + + public final T get() { + return value; + } + + public final T get(IRI iri) { + return get.apply(iri); + } + + T check(String kind, IRI iri) { + T v = get(iri); + String n = invoker.name(); + if (!Objects.equals(v, value)) { + String msg = String.format("check failed for %s %s: expected %s, got %s", + kind, n, value, v); + System.out.println(msg); + throw new RuntimeException(msg); + } else { + if (DEBUG) { + String msg = String.format("IRI has expected %s %s: %s", kind, n, v); + System.out.println(msg); + } + } + return v; + } + + @Override + public String toString() { + return String.format("%s: %s", invoker.name(), value); + } + + RawValue raw() { + return new RawValue<>(value, invoker); + } + + Value value() { + return new Value<>(value, invoker); + } + } + + // A component raw value to be checked against the built + // IRI. + static final class RawValue extends CheckValue { + + private RawValue(T value, ValueAccessor invoker) { + super(invoker, value, invoker::getRaw); + } + + public T check(IRI iri) { + return super.check("raw", iri); + } + + @Override + public String toString() { + return String.format("raw %s: %s", invoker.name(), value); + } + + public static RawValue of(T value, ValueAccessor invoker) { + return new RawValue(value, invoker); + } + } + + // A decoded component value to be checked against the built + // IRI. 
+ static final class Value extends CheckValue { + private Value(T value, ValueAccessor invoker) { + super(invoker, value, invoker::get); + } + + public T check(IRI iri) { + return super.check("decoded", iri); + } + + public static Value of(T value, ValueAccessor invoker) { + return new Value(value, invoker); + } + } + + // An object that provides methods to set a component + // value on a builder, and get it from an IRI. + interface ValueAccessor { + IRI.Builder set(IRI.Builder b, V v); + V get(IRI iri); + V getRaw(IRI iri); + String name(); + } + + // Models an IRI component: scheme, authority, host, user info, port, + // path, opaque path, query, fragment. + private static final class Component implements ValueAccessor { + static final Map> COMPONENTS + = new LinkedHashMap<>(); + final String name; + final Component alias; // only used for OPAQUE + final Function getRaw; + final Function get; + final BiFunction set; + final boolean isNullable; + final List erasing; + final V none; + + private Component(String name, + Function getRaw, + Function get, + BiFunction set) { + this(name, getRaw, get, set, null); + } + + private Component(String name, + Function getRaw, + Function get, + BiFunction set, + V none, + String... erasing) { + this(name, null, getRaw, get, set, none, erasing); + } + + private Component(String name, + Component alias, + Function getRaw, + Function get, + BiFunction set, + V none, + String... 
erasing) { + this.name = Objects.requireNonNull(name); + this.alias = alias; // only used for OPAQUE + this.getRaw = Objects.requireNonNull(getRaw, "getRaw"); + this.get = Objects.requireNonNull(get, "get");; + this.set = Objects.requireNonNull(set, "set");; + this.isNullable = (none == null); + this.none = none; + if (COMPONENTS.put(name, this) != null) { + assert false : name; + } + this.erasing = List.of(erasing); + } + + @Override + public String name() { + return name; + } + + @Override + public IRI.Builder set(IRI.Builder b, V v) { + return set.apply(b, v); + } + @Override + public V get(IRI iri) { + return get.apply(iri); + } + @Override + public V getRaw(IRI iri) { + return getRaw.apply(iri); + } + + public InputValue input(V value) { + if (!isNullable) { + Objects.requireNonNull(value); + } + return InputValue.of(value, this); + } + + public InputValue unspecified() { + return InputValue.empty(this); + } + + public Value none() { + return value(none); + } + + public RawValue noraw() { + return raw(none); + } + + public RawValue raw(V value) { + return RawValue.of(value, this); + } + + public RawValue raw(CheckValue value) { + return RawValue.of(value.get(), this); + } + + public Value value(V value) { + return Value.of(value, this); + } + + public Component alias() { + return alias; + } + + public Stream> erasing() { + return erasing.stream().map(Component::find); + } + + public static Component find(String name) { + return COMPONENTS.get(name); + } + + // special case for opaque path + static String opaque(IRI iri) { + return iri.isOpaque() ? iri.getPath() : null; + } + static String rawOpaque(IRI iri) { + return iri.isOpaque() ? 
iri.getRawPath() : null; + } + } + +} diff --git a/net-resource/src/test/java/org/xbib/net/resource/IRItoURLTest.java b/net-resource/src/test/java/org/xbib/net/resource/IRItoURLTest.java new file mode 100644 index 0000000..ad984be --- /dev/null +++ b/net-resource/src/test/java/org/xbib/net/resource/IRItoURLTest.java @@ -0,0 +1,186 @@ +package org.xbib.net.resource;/* + * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +/** + * @test + * @bug 4768755 4677045 8147462 + * @summary URL.equal(URL) is inconsistent for opaque IRI.toURL() + * and new URL(IRI.toString) + * IRI.toURL() does not always work as specified + * Ensure URIs representing invalid/malformed URLs throw similar + * exception with new URL(IRI.toString()) and IRI.toURL() + */ + +import java.net.MalformedURLException; +import java.net.URL; + +public class IRItoURLTest { + + public static void main(String args[]) throws Exception { + + URL classUrl = new URL("jrt:/java.base/java/lang/Object.class"); + + String[] uris = { + "mailto:xyz@abc.de", + "file:xyz#ab", + "http:abc/xyz/pqr", + "http:abc/xyz/pqr?id=x%0a&ca=true", + "file:/C:/v700/dev/unitTesting/tests/apiUtil/uri", + "http:///p", + "file:/C:/v700/dev/unitTesting/tests/apiUtil/uri", + "file:/C:/v700/dev%20src/unitTesting/tests/apiUtil/uri", + "file:/C:/v700/dev%20src/./unitTesting/./tests/apiUtil/uri", + "http://localhost:80/abc/./xyz/../pqr?id=x%0a&ca=true", + "file:./test/./x", + "file:./././%20#i=3", + "file:?hmm", + "file:.#hmm", + classUrl.toExternalForm(), + }; + + // Strings that represent valid URIs but invalid URLs that should throw + // MalformedURLException both when calling toURL and new URL(String) + String[] malformedUrls = { + "test:/test", + "fiel:test", + }; + + // Non-absolute URIs should throw IAE when calling toURL but will throw + // MalformedURLException when calling new URL + String[] illegalUris = { + "./test", + "/test", + }; + + boolean isTestFailed = false; + boolean isURLFailed = false; + + for (String uriString : uris) { + IRI uri = IRI.of(uriString); + + URL url1 = new URL(uri.toString()); + URL url2 = uri.toURL(); + System.out.println("Testing URI " + uri); + + if (!url1.equals(url2)) { + System.out.println("equals() FAILED"); + isURLFailed = true; + } + if (url1.hashCode() != url2.hashCode()) { + System.out.println("hashCode() DIDN'T MATCH"); + isURLFailed = true; + } + if (!url1.sameFile(url2)) { + 
System.out.println("sameFile() FAILED"); + isURLFailed = true; + } + + if (!equalsComponents("getPath()", url1.getPath(), + url2.getPath())) { + isURLFailed = true; + } + if (!equalsComponents("getFile()", url1.getFile(), + url2.getFile())) { + isURLFailed = true; + } + if (!equalsComponents("getHost()", url1.getHost(), + url2.getHost())) { + isURLFailed = true; + } + if (!equalsComponents("getAuthority()", + url1.getAuthority(), url2.getAuthority())) { + isURLFailed = true; + } + if (!equalsComponents("getRef()", url1.getRef(), + url2.getRef())) { + isURLFailed = true; + } + if (!equalsComponents("getUserInfo()", url1.getUserInfo(), + url2.getUserInfo())) { + isURLFailed = true; + } + if (!equalsComponents("toString()", url1.toString(), + url2.toString())) { + isURLFailed = true; + } + + if (isURLFailed) { + isTestFailed = true; + } else { + System.out.println("PASSED .."); + } + System.out.println(); + isURLFailed = false; + } + for (String malformedUrl : malformedUrls) { + Exception toURLEx = null; + Exception newURLEx = null; + try { + IRI.parseIRI(malformedUrl).toURL(); + } catch (Exception e) { + // expected + toURLEx = e; + } + try { + new URL(IRI.parseIRI(malformedUrl).toString()); + } catch (Exception e) { + // expected + newURLEx = e; + } + if (!(toURLEx instanceof MalformedURLException) || + !(newURLEx instanceof MalformedURLException) || + !toURLEx.getMessage().equals(newURLEx.getMessage())) { + isTestFailed = true; + System.out.println("Expected the same MalformedURLException: " + + newURLEx + " vs " + toURLEx); + } + } + for (String illegalUri : illegalUris) { + try { + IRI.parseIRI(illegalUri).toURL(); + } catch (IllegalArgumentException e) { + // pass + } + + try { + new URL(illegalUri); + } catch (MalformedURLException e) { + // pass + } + } + if (isTestFailed) { + throw new Exception("URI.toURL() test failed"); + } + } + + static boolean equalsComponents(String method, String comp1, String comp2) { + if ((comp1 != null) && (!comp1.equals(comp2))) 
{ + System.out.println(method + " DIDN'T MATCH" + + " ===>"); + System.out.println(" URL(URI.toString()) returns:" + comp1); + System.out.println(" URI.toURL() returns:" + comp2); + return false; + } + return true; + } +} diff --git a/net-resource/src/test/java/org/xbib/net/resource/RelativeEncoding.java b/net-resource/src/test/java/org/xbib/net/resource/RelativeEncoding.java new file mode 100644 index 0000000..f0c758e --- /dev/null +++ b/net-resource/src/test/java/org/xbib/net/resource/RelativeEncoding.java @@ -0,0 +1,61 @@ +package org.xbib.net.resource; +/* + * Copyright (c) 2003, 2019 Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +/** + * @test + * @bug 4866303 + * @summary URI.resolve escapes characters in parameter URI + */ + +import java.io.File; +import java.net.URISyntaxException; +import java.util.Objects; + +public class RelativeEncoding { + public static void main(String[] args) { + try { + IRI one = IRI.parseIRI("Relative%20with%20spaces"); + // TODO: add a File.toIRI() method? + IRI two = IRI.parseIRI((new File("/tmp/dir with spaces/File with spaces")).toURI().toString()); + IRI three = two.resolve(one); + if (three.isOpaque()) + throw new RuntimeException("Bad encoding on IRI.resolve: should not be opaque"); + System.out.println(String.format("resolved path is: \"%s\" [\"%s\"]", + three.getPath(), three.getRawPath())); + checkEquals("/tmp/dir with spaces/Relative with spaces", + three.getPath(), "path"); + checkEquals("/tmp/dir%20with%20spaces/Relative%20with%20spaces", + three.getRawPath(), "raw path"); + } catch (URISyntaxException e) { + throw new RuntimeException("Unexpected exception: " + e); + } + } + + static void checkEquals(String expected, String found, String name) { + if (!Objects.equals(expected, found)) { + throw new RuntimeException( + String.format("Unexpected %s: \"%s\"\n\t expected \"%s\"", + name, found, expected)); + } + } +} diff --git a/net-resource/src/test/java/org/xbib/net/resource/Test.java b/net-resource/src/test/java/org/xbib/net/resource/Test.java new file mode 100644 index 0000000..ee0ff0f --- /dev/null +++ b/net-resource/src/test/java/org/xbib/net/resource/Test.java @@ -0,0 +1,3894 @@ +package org.xbib.net.resource; +/* + * Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* @test + * @summary Unit test for java.net.IRI + * @bug 8019345 6345502 6363889 6345551 6348515 + */ + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.PrintStream; +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.MalformedURLException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Consumer; +import java.util.function.IntConsumer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.IntStream; +import java.util.stream.Stream; + + +public class Test { + + static PrintStream out = System.out; + static int testCount = 0; + static final List BIDIS = List.of(new Character[] { + 0x200E, 0x200F, 0x202A, 0x202B, 0x202C, 0x202D, 0x202E + }); + private static final char[] hexDigits = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' + }; 
+ + // Properties that we check + static final long PARSEFAIL = 1L << 0; + static final long SCHEME = 1L << 1; + static final long SSP = 1L << 2; + static final long SSP_D = 1L << 3; // Decoded form + static final long OPAQUEPART = 1L << 4; // SSP, and URI is opaque + static final long USERINFO = 1L << 5; + static final long USERINFO_D = 1L << 6; // Decoded form + static final long HOST = 1L << 7; + static final long PORT = 1L << 8; + static final long REGISTRY = 1L << 9; + static final long REGISTRY_D = 1L << 10; // Decoded form + static final long PATH = 1L << 11; + static final long PATH_D = 1L << 12; // Decoded form + static final long QUERY = 1L << 13; + static final long QUERY_D = 1L << 14; // Decoded form + static final long FRAGMENT = 1L << 15; + static final long FRAGMENT_D = 1L << 16; // Decoded form + static final long TOASCII = 1L << 17; + static final long IDENT_STR = 1L << 18; // Identities + static final long IDENT_URI1 = 1L << 19; + static final long IDENT_URI3 = 1L << 20; + static final long IDENT_URI5 = 1L << 21; + static final long IDENT_URI7 = 1L << 22; + static final long IDENT_IRI1 = 1L << 23; + static final long TOSTRING = 1L << 24; + static final long TOISTRING = 1L << 25; + static final long RSLV = 1L << 26; + static final long RTVZ = 1L << 27; + static final long IDENT_QURI = 1L << 28; + static final long IDENT_BLD1 = 1L << 29; + static final long IDENT_BLD2 = 1L << 30; + static final long IDENT_ISTR = 1L << 31; + static final long TOLSTRING = 1L << 32; + static final long IDENT_RAWO = 1L << 33; + static final long IDENT_RAW5 = 1L << 34; + static final long IDENT_RAW7 = 1L << 35; + static final long HOST_TYPE = 1L << 36; + static final long DNS_HOST = 1L << 37; + static final long IRI_OF = 1L << 38; + static final long URI_OF = 1L << 39; + + + String input; + IRI uri = null; + IRI originalURI; + IRI base = null; // Base for resolution/relativization + String op = null; // Op performed if uri != originalURI + long checked = 0; // Mask for 
checked properties + long failed = 0; // Mask for failed properties + Exception exc = null; + IRI invariantURI = null; + boolean checkInvariantURI; + + interface Parser { + public IRI parse(String str) throws URISyntaxException; + } + interface Input { + public String toInputString(String input, IRI iri); + } + private static String input(String input, IRI iri) { + return input; + } + private static String iri(String input, IRI iri) { + return iri == null ? input : iri.toString(); + } + + private Test(String s) { + this(s, IRI::parseIRI, Test::input); + } + + private Test(String s, Parser parser, Input input) { + testCount++; + this.input = s; + try { + uri = parser.parse(s); + } catch (URISyntaxException x) { + exc = x; + } finally { + this.input = input.toInputString(s, uri); + } + + originalURI = uri; + } + + static Test test(String s) { + return new Test(s); + } + + static Test lenient(String s) { + return new Test(s, IRI::parseLenient, Test::iri); + } + + private Test(String s, String u, String h, int n, + String p, String q, String f) + { + testCount++; + try { + uri = IRI.createHierarchical(s, u, h, n, p, q, f); + } catch (URISyntaxException x) { + exc = x; + input = x.getInput(); + } catch (IllegalArgumentException x) { + exc = x; + } + if (uri != null) + input = uri.toString(); + originalURI = uri; + } + + static Test test(String s, String u, String h, int n, + String p, String q, String f) { + return new Test(s, u, h, n, p, q, f); + } + + private Test(String s, String a, + String p, String q, String f) + { + testCount++; + try { + uri = IRI.createHierarchical(s, a, p, q, f); + } catch (URISyntaxException x) { + exc = x; + input = x.getInput(); + } catch (IllegalArgumentException x) { + exc = x; + } + if (uri != null) + input = uri.toString(); + originalURI = uri; + } + + static Test test(String s, String a, + String p, String q, String f) { + return new Test(s, a, p, q, f); + } + + private Test(String s, String h, String p, String f) { + testCount++; + 
try { + uri = IRI.createHierarchical(s, h, p, f); + } catch (URISyntaxException x) { + exc = x; + input = x.getInput(); + } catch (IllegalArgumentException x) { + exc = x; + } + if (uri != null) + input = uri.toString(); + originalURI = uri; + } + + static Test test(String s, String h, String p, String f) { + return new Test(s, h, p, f); + } + + private Test(String s, String ssp, String f) { + testCount++; + try { + int index = ssp == null ? -1 : ssp.indexOf('?'); + String query = index < 0 ? null : ssp.substring(index+1); + String opaque = index < 0 ? ssp : ssp.substring(0, index); + if (s == null || opaque == null || opaque.isEmpty() + || opaque.startsWith("/")) { + if (opaque != null && opaque.startsWith("//")) { + Matcher m = Pattern.compile("^//([^/]*)((/.*)?)") + .matcher(opaque); + m.matches(); + String auth = m.group(1); + String path = m.group(2); + uri = IRI.createHierarchical(s, auth, path, query, f); + } else { + uri = IRI.createHierarchical(s, null, opaque, query, f); + } + } else { + uri = IRI.createOpaque(s, opaque, query, f); + } + } catch (URISyntaxException x) { + exc = x; + input = x.getInput(); + } catch (IllegalArgumentException x) { + exc = x; + } + if (uri != null) + input = uri.toString(); + originalURI = uri; + } + + static Test test(String s, String ssp, String f) { + return new Test(s, ssp, f); + } + + private Test(String s, boolean xxx) { + testCount++; + try { + uri = IRI.of(s); + } catch (IllegalArgumentException x) { + exc = x; + } + if (uri != null) + input = uri.toString(); + originalURI = uri; + } + + static Test testCreate(String s) { + return new Test(s, false); + } + + boolean parsed() { + return uri != null; + } + + boolean resolved() { + return base != null; + } + + IRI uri() { + return uri; + } + + + // Operations on Test instances + // + // These are short so as to make test cases compact. 
+ // + // s Scheme + // sp Scheme-specific part + // spd Scheme-specific part, decoded + // o Opaque part (isOpaque() && ssp matches) + // g reGistry (authority matches, and host is not defined) + // gd reGistry, decoded + // u User info + // ud User info, decoded + // h Host + // hd Host, decoded + // n port Number + // p Path + // pd Path, decoded + // q Query + // qd Query, decoded + // f Fragment + // fd Fragment, decoded + // + // rslv Resolve against given base + // rtvz Relativize + // psa Parse server Authority + // norm Normalize + // ta ASCII form + // + // x Check that parse failed as expected + // z End -- ensure that unchecked components are null + + private boolean check1(long prop) { + checked |= prop; + if (!parsed()) { + failed |= prop; + return false; + } + return true; + } + + private void check2(String s, String ans, long prop) { + if ((s == null) || !s.equals(ans)) + failed |= prop; + } + + private static String escape(char c) { + assert c < 0x80; + return appendEscape(new StringBuilder(), (byte)c).toString(); + } + + private static String escapeSSP(String ssp) { + return ssp == null ? null : + ssp.replace("?", escape('?')) + .replace("#", escape('#')); + } + + private static String appendSSP(IRI iri, String authority, String p, String q) { + if (iri.isOpaque() || authority == null) { + if (p != null) p = escapeSSP(p); + return (p == null ? "" : p) + + (q == null ? "" : ("?" + q)); + } else { + return "//" + escapeSSP(authority) + + (p == null ? "" : escapeSSP(p)) + + (q == null ? "" : ("?" + q)); + } + } + + // simulate SSP + // This is not strictly true as IRI::getSchemeSpecificPart() + // would not completely decode the opaque path and + // query when it contained %25hh triplets, but it's + // an approximation which is good enough for the + // purpose of this test. 
+ private static String getSchemeSpecificPart(IRI iri) { + String authority = iri.getAuthority(); + String p = iri.getPath(); + String q = iri.getQuery(); + return appendSSP(iri, authority, p, q); + } + + // simulate raw SSP + private static String getRawSchemeSpecificPart(IRI iri) { + String authority = iri.getRawAuthority(); + String p = iri.getRawPath(); + String q = iri.getRawQuery(); + return appendSSP(iri, authority, p, q); + } + + // Allows to erase all previous checks in order + // to force all components to be rechecked. + // Useful when calling operations such as normalize or + // resolve. Use with care though. + // Example: + // test(..).s(..).h(..).p(..) // build, check (original form) + // .z().rst().norm() // report, reset, normalize + // .s(..).h(..).p(..) // recheck (normalized form) + // .z() // report again + // should be safe enough. + // + Test rst() { + // erase all checks but PARSEFAIL + checked &= PARSEFAIL; + return this; + } + + Test s(String s) { + if (check1(SCHEME)) check2(uri.getScheme(), s, SCHEME); + return this; + } + + Test u(String s) { + if (check1(USERINFO)) check2(uri.getRawUserInfo(), s, USERINFO); + return this; + } + + Test ud(String s) { + if (check1(USERINFO_D)) { + check2(uri.getUserInfo(), s, USERINFO_D); + } + return this; + } + + Test h(String s) { + if (check1(HOST)) check2(uri.getRawHostString(), s, HOST); + return this; + } + + Test hd(String s) { + if (check1(HOST)) check2(uri.getHostString(), s, HOST); + return this; + } + + Test ht(IRI.HostType type) { + if (type != IRI.getHostType(uri.getHostString())) { + failed |= HOST_TYPE; + } + if (type.isInternetName()) { + if (type != IRI.getHostType(uri.getHost())) { + failed |= HOST_TYPE; + } + } else if (IRI.HostType.None != IRI.getHostType(uri.getHost())) { + failed |= HOST_TYPE; + } + return this; + } + + Test ipv6() { + return ht(IRI.HostType.IPv6); + } + + Test ipv4() { + return ht(IRI.HostType.IPv4); + } + + Test ipvf() { + return ht(IRI.HostType.IPvFuture); + } + 
+ Test regn() { + return ht(IRI.HostType.RegName); + } + + Test dns() { + return ht(IRI.HostType.DNSRegName); + } + + Test g(String s) { + if (check1(REGISTRY)) { + if (uri.getHostString() == null) // RFC 3986: host is reg-name + failed |= REGISTRY; + else { + check2(uri.getRawAuthority(), s, REGISTRY); + } + } + return this; + } + + Test gd(String s) { + if (check1(REGISTRY_D)) { + if (uri.getHostString() == null) // RFC 3986: host is reg-name + failed |= REGISTRY_D; + else + check2(uri.getAuthority(), s, REGISTRY_D); + } + return this; + } + + Test n(int n) { + checked |= PORT; + if (!parsed() || (uri.getPort() != n)) + failed |= PORT; + return this; + } + + Test p(String s) { + if (check1(PATH)) check2(uri.getRawPath(), s, PATH); + return this; + } + + Test pd(String s) { + if (check1(PATH_D)) check2(uri.getPath(), s, PATH_D); + return this; + } + + Test o(String s) { + if (check1(OPAQUEPART)) { + if (!uri.isOpaque()) + failed |= OPAQUEPART; + else + check2(getSchemeSpecificPart(uri), s, OPAQUEPART); + } + return this; + } + + Test sp(String s) { + if (check1(SSP)) check2(getRawSchemeSpecificPart(uri), s, SSP); + return this; + } + + Test spd(String s) { + if (check1(SSP_D)) check2(getSchemeSpecificPart(uri), s, SSP_D); + return this; + } + + Test q(String s) { + if (check1(QUERY)) check2(uri.getRawQuery(), s, QUERY); + return this; + } + + Test qd(String s) { + if (check1(QUERY_D)) check2(uri.getQuery(), s, QUERY_D); + return this; + } + + Test f(String s) { + if (check1(FRAGMENT)) check2(uri.getRawFragment(), s, FRAGMENT); + return this; + } + + Test fd(String s) { + if (check1(FRAGMENT_D)) check2(uri.getFragment(), s, FRAGMENT_D); + return this; + } + + Test ta(String s) { + if (check1(TOASCII)) + check2(uri.toASCIIString(), s, TOASCII); + return this; + } + + Test ts(String s) { + if (check1(TOSTRING)) + check2(uri.toString(), s, TOSTRING); + return this; + } + + Test ti(String s) { + if (check1(TOISTRING)) + check2(uri.toIRIString(), s, TOISTRING); + return 
this; + } + + Test tl(String s) { + if (check1(TOLSTRING)) + check2(uri.toLenientString(), s, TOLSTRING); + return this; + } + + Test x() { + checked |= PARSEFAIL; + if (parsed()) + failed |= PARSEFAIL; + return this; + } + + Test iae() { + checked |= PARSEFAIL; + if (parsed()) + failed |= PARSEFAIL; + else if (!IllegalArgumentException.class.isInstance(exc)) { + throw new RuntimeException("Expected IllegalArgumentException, got: " + exc, exc); + } + return this; + } + + Test use() { + checked |= PARSEFAIL; + if (parsed()) + failed |= PARSEFAIL; + else if (!URISyntaxException.class.isInstance(exc)) { + throw new RuntimeException("Expected IllegalArgumentException, got: " + exc, exc); + } + return this; + } + + Test rslv(String base) { + return rslv(IRI.of(base)); + } + + Test rslv(IRI base) { + if (!parsed()) + return this; + this.base = base; + op = "rslv"; + IRI u = uri; + uri = null; + try { + this.uri = base.resolve(u); + String up = u.normalize().getPath(); + this.invariantURI = base.relativize(uri); + if (u.getScheme() != null || + !up.startsWith("../") && + !up.equals("..") && + !mustExclude(base, u)) + { + checkInvariantURI = true; + } + } catch (IllegalArgumentException x) { + exc = x; + } + checked = 0; + failed = 0; + return this; + } + + // return true if we can't guarantee that the invariant will hold. + // tries to include as many corner cases as possible... + // + // Note1: This is the negation of this big condition: + // ((!up.startsWith("/") && + // (!up.isEmpty() || + // (u.getAuthority() == null && (base.getQuery() == null || u.getQuery() != null)))) + // || base.getScheme() == null) + // + // Note2: we could widen the condition to the negation of: + // ((!up.startsWith("/") && !up.isEmpty()) || base.getScheme() == null) + // but that would exclude too many cases from testing. 
+ static boolean mustExclude(IRI base, IRI u) { + assert u.getScheme() == null; + String up = u.normalize().getPath(); + if (!up.startsWith("/") && !up.isEmpty() + && base.getPath().isEmpty() && base.getAuthority() != null) + return true; // if the base has an authority and no path + // a / will be prepended to the relative path + if (up.startsWith("/")) return base.getScheme() != null; + if (!up.isEmpty()) return base.getScheme() == null; + if (u.getAuthority() != null) return base.getScheme() != null; + return base.getQuery() != null && u.getQuery() == null; + } + + Test norm() { + if (!parsed()) + return this; + op = "norm"; + uri = uri.normalize(); + return this; + } + + Test rtvz(IRI base) { + if (!parsed()) + return this; + this.base = base; + op = "rtvz"; + try { + IRI u = uri; + uri = base.relativize(uri); + String up = u.normalize().getPath(); + invariantURI = base.resolve(uri); + if (u.getScheme() != null || + !up.startsWith("../") + && !up.equals("..") + && !mustExclude(base, u) + ) { + checkInvariantURI = true; + } + } catch (IllegalArgumentException x) { + exc = x; + } + checked = 0; + failed = 0; + return this; + } + +// Test psa() { +// try { +// uri.parseServerAuthority(); +// } catch (URISyntaxException x) { +// exc = x; +// uri = null; +// } +// checked = 0; +// failed = 0; +// return this; +// } + + Test psa() { + return this; + } + + private void checkEmpty(String s, long prop) { + if (((checked & prop) == 0) && (s != null)) + failed |= prop; + } + + // Check identity for the seven-argument IRI constructor + // + void checkURI7() { + // Only works on hierarchical URIs + if (uri.isOpaque()) + return; + // Not true if decoding getters return %xx triplets + // that could be further decoded. 
Those will be encoded + // as %25xx sequences in the IRI string + if (uri.toIRIString().matches(".*%25[0-9a-bA-B]{2}.*")) { + // out.println("Skipping IDENT_URI5 for: " + uri.toString()); + return; + } + try { + IRI u2 = IRI.createHierarchical(uri.getScheme(), uri.getUserInfo(), + uri.getHostString(), uri.getPort(), uri.getPath(), + uri.getQuery(), uri.getFragment()); + if (!uri.equals(u2)) + failed |= IDENT_URI7; + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_URI7; + } + } + + // Check identity for the five-argument IRI constructor + // + void checkURI5() { + // Only works on hierarchical URIs + if (uri.isOpaque()) + return; + // Not true if decoding getters return %xx triplets + // that could be further decoded. Those will be encoded + // as %25xx sequences in the IRI string + if (uri.toIRIString().matches(".*%25[0-9a-bA-B]{2}.*")) { + // out.println("Skipping IDENT_URI5 for: " + uri.toString()); + return; + } + try { + IRI u2 = IRI.createHierarchical(uri.getScheme(), uri.getAuthority(), + uri.getPath(), uri.getQuery(), uri.getFragment()); + if (!uri.equals(u2)) + failed |= IDENT_URI5; + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_URI5; + } + } + + // Check identity for the three-argument IRI constructor + // + void checkURI3() { + try { + IRI u2 = uri.isOpaque() + ? 
IRI.createOpaque(uri.getScheme(), + uri.getRawPath(), + uri.getRawQuery(), + uri.getRawFragment()) + : IRI.createHierarchical(uri.getScheme(), + uri.getAuthority(), + uri.getRawPath(), + uri.getRawQuery(), + uri.getRawFragment()); + if (!uri.equals(u2)) { + out.println("IDENT-URI3 failed \"" + uri +"\" != \"" + u2 + "\""); + failed |= IDENT_URI3; + } + } catch (URISyntaxException x) { + out.println("IDENT-URI3 failed for \"" + uri +"\": " + x); + if (exc == null) exc = x; + failed |= IDENT_URI3; + } + } + + void conversions() { + if (uri == null) { + try { + IRI.of(uri); + failed |= IRI_OF; + } catch (NullPointerException x) { + // OK + } + } else { + try { + if (IRI.of(uri) != uri) { + failed |= IRI_OF; + } + } catch (Exception x) { + if (exc == null) exc = x; + failed |= IRI_OF; + } + } + + /*if (uri == null) { + // + } else { + boolean expectFail = !isURI(uri); + try { + if (expectFail) { + failed |= URI_OF; + } + } catch (Exception x) { + if (!expectFail || + !(x instanceof IllegalArgumentException)) { + if (exc == null) exc = x; + failed |= URI_OF; + } + } + }*/ + } + + private static boolean isURI(IRI uri) { + String s = uri.getScheme(); + String ra = uri.getRawAuthority(); + String rp = uri.getRawPath(); + String rq = uri.getRawQuery(); + String rf = uri.getRawFragment(); + if (s != null && (ra == null || ra.isEmpty()) + && (rp == null || rp.isEmpty()) + && rq == null && rf == null) { + // "s:" or "s://" can't be converted + return false; + } + if (s == null && (ra != null && ra.isEmpty()) + && (rp == null || rp.isEmpty()) + && rq == null && rf == null) { + // "//" cannot be converted + return false; + } + if (":".equals(ra) && (rp == null || rp.isEmpty())) { + // s://: would parse with ':' as a reg-name, + // but IRI.toURI() removes the superfluous ':' + // and s:// or // do not parse. 
+ return false; + } + String h = uri.getRawHostString(); + // IPvFuture won't parse as a URI + if (IRI.getHostType(h) == IRI.HostType.IPvFuture) { + return false; + } + return true; + } + + void checkHostType() { + + // No host means all accessors should return null. + if (IRI.getHostType(uri.getRawHostString()) == IRI.HostType.None) { + if (IRI.getHostType(uri.getHostString()) != IRI.HostType.None) { + failed |= DNS_HOST; + } + if (IRI.getHostType(uri.getHost()) != IRI.HostType.None) { + failed |= DNS_HOST; + } + } + + // If the raw host parses as an internet name then the others + // should too. + if (IRI.getHostType(uri.getRawHostString()).isInternetName()) { + if (!IRI.getHostType(uri.getHostString()).isInternetName()) { + failed |= DNS_HOST; + } + if (!IRI.getHostType(uri.getHost()).isInternetName()) { + failed |= DNS_HOST; + } + // In addition, raw host string, host string, and host + // should all be equal. + if (!uri.getRawHostString().equals(uri.getHostString())) { + failed |= DNS_HOST; + } + if (!uri.getRawHostString().equals(uri.getHost())) { + failed |= DNS_HOST; + } + } + + // if host is null then neither raw host string nor host string + // should parse as an internet name. + if (uri.getHost() == null) { + if (uri.getRawHostString() != null) { + if (IRI.getHostType(uri.getRawHostString()).isInternetName()) { + failed |= DNS_HOST; + } + if (IRI.getHostType(uri.getHostString()).isInternetName()) { + failed |= DNS_HOST; + } + } + } else { + // if host is non null then it must be an internet name. + if (!IRI.getHostType(uri.getHost()).isInternetName()) { + failed |= DNS_HOST; + } + // if host is non null then the host string must be an + // internet name too (though the raw host string might not). 
+ if (!IRI.getHostType(uri.getHostString()).isInternetName()) { + failed |= DNS_HOST; + } + // if host is non null then it should be equal to the + // host string + if (!uri.getHost().equals(uri.getHostString())) { + failed |= DNS_HOST; + } + } + } + + // Check all identities mentioned in the IRI class specification + // (and some more) + void checkIdentities() { + if (input != null) { + if (!uri.toString().equals(input)) + failed |= IDENT_STR; + } + try { + if (!(IRI.parseIRI(uri.toString())).equals(uri)) + failed |= IDENT_URI1; + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_URI1; + } + + try { + if (!(IRI.parseIRI(uri.toIRIString())).equals(uri)) + failed |= IDENT_IRI1; + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_IRI1; + } + + try { + if (!(uri.with(0).build().equals(uri))) + failed |= IDENT_BLD1; + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_BLD1; + } + + try { + if (!(uri.with(IRI.Builder.QUOTE_ENCODED_CAPABILITY) + .build().equals(uri))) + failed |= IDENT_BLD2; + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_BLD2; + } + + try { + if (uri.isOpaque() && + !uri.equals(IRI.createOpaque(uri.getScheme(), + uri.getRawPath(), uri.getRawQuery(), + uri.getRawFragment()))) { + failed |= IDENT_RAWO; + } + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_RAWO; + } + + try { + if (!uri.isOpaque() && + !uri.equals(IRI.createHierarchical(uri.getScheme(), + uri.getRawAuthority(), + uri.getRawPath(), uri.getRawQuery(), + uri.getRawFragment()))) { + failed |= IDENT_RAW5; + } + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_RAW5; + } + + try { + if (!uri.isOpaque() && + !uri.equals(IRI.createHierarchical(uri.getScheme(), + uri.getRawUserInfo(), uri.getRawHostString(), uri.getPort(), + uri.getRawPath(), uri.getRawQuery(), + uri.getRawFragment()))) { + failed |= IDENT_RAW7; + } + } 
catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_RAW7; + } + + try { + String iriStr1 = uri.toIRIString(); + String iriStr2 = IRI.parseIRI(iriStr1).toIRIString(); + if (!iriStr1.equals(iriStr2)) { + failed |= IDENT_ISTR; + System.out.println(String.format( + "*** IRI Strings differ for \"%s\": " + + "\n\t original: \"%s\"" + + "\n\t rebuilt: \"%s\"", uri, iriStr1, iriStr2)); + } + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_ISTR; + } + + // Verifies that an IRI can be safely embedded into a query and subsequently + // extracted from the query string if it is quoted first. + try { + IRI otherU = IRI.createHierarchical("s", "h", "/p", + "uri=" + IRI.quoteEncodedOctets(uri.toString()), "f"); + IRI otherI = IRI.createHierarchical("s", "h", "/p", + "uri=" + IRI.quoteEncodedOctets(uri.toIRIString()), "f"); + IRI otherA = IRI.createHierarchical("s", "h", "/p", + "uri=" + IRI.quoteEncodedOctets(uri.toASCIIString()), "f"); + if (!uri.equals(IRI.parseIRI(otherU.getQuery().substring(4)))) { + out.println("query-ident failed for uri.toString()"); + failed |= IDENT_QURI; + } + if (!uri.equals(IRI.parseIRI(otherI.getQuery().substring(4)))) { + out.println("query-ident failed for uri.toIRIString()"); + failed |= IDENT_QURI; + } + if (!uri.equals(IRI.parseIRI(otherA.getQuery().substring(4)))) { + out.println("query-ident failed for uri.toASCIIString()"); + failed |= IDENT_QURI; + } + } catch (URISyntaxException x) { + if (exc == null) exc = x; + failed |= IDENT_QURI; + } + + // Remaining identities fail if "//" given but authority is undefined + if ((uri.getAuthority() == null) + && (getSchemeSpecificPart(uri) != null) + && (getSchemeSpecificPart(uri).startsWith("///") + || getSchemeSpecificPart(uri).startsWith("//?") + || getSchemeSpecificPart(uri).equals("//"))) + return; + + checkURI3(); + checkURI5(); + checkURI7(); + } + + // Check identities, check that unchecked component properties are not + // defined, and 
report any failures + // + Test z() { + conversions(); + + if (!parsed()) { + report(); + return this; + } + + if (op == null) + checkIdentities(); + + checkHostType(); + + // Check that unchecked components are undefined + checkEmpty(uri.getScheme(), SCHEME); + checkEmpty(uri.getUserInfo(), USERINFO); + checkEmpty(uri.getHostString(), HOST); + if (((checked & PORT) == 0) && (uri.getPort() != -1)) failed |= PORT; + checkEmpty(uri.getPath(), PATH); + checkEmpty(uri.getQuery(), QUERY); + checkEmpty(uri.getFragment(), FRAGMENT); + + if (invariantURI != null) { + if (checkInvariantURI && !normeq(invariantURI, originalURI)) { + switch(op) { + case "rtvz": + failed |= RTVZ; break; + case "rslv": + failed |= RSLV ; break; + default: break; + } + } + } + + // Report failures + report(); + return this; + } + + static boolean normeq(IRI u, IRI v) { + return u.normalize().equals(v.normalize()); + } + + // Summarization and reporting + + static void header(String s) { + out.println(); + out.println(); + out.println("-- " + s + " --"); + } + + static void show(String prefix, URISyntaxException x) { + out.println(uquote(x.getInput())); + if (x.getIndex() >= 0) { + for (int i = 0; i < x.getIndex(); i++) { + if (x.getInput().charAt(i) >= '\u0080') + out.print(" "); // Skip over \u1234 + else + out.print(" "); + } + out.println("^"); + } + out.println(prefix + ": " + x.getReason()); + } + + private void summarize() { + out.println(); + StringBuffer sb = new StringBuffer(); + if (input != null && input.length() == 0) + sb.append("\"\""); + else if (input != null) + sb.append(input); + else if (input == null) + sb.append("create failed"); + if (base != null) { + sb.append(" "); + sb.append(base); + } + if (!parsed()) { + String s = (((checked & PARSEFAIL) != 0) + ? 
"Correct exception" : "UNEXPECTED EXCEPTION"); + if (exc instanceof URISyntaxException) { + show(s, (URISyntaxException) exc); + if ((checked & PARSEFAIL) == 0) + exc.printStackTrace(out); + } else if (exc instanceof IllegalArgumentException) { + out.println(s + ": " + exc); + if ((checked & PARSEFAIL) == 0) + exc.printStackTrace(out); + } else { + out.println(uquote(sb.toString())); + out.print(s + ": "); + exc.printStackTrace(out); + } + } else { + if (op != null) { + sb.append(" "); + sb.append(op); + sb.append(" --> "); + sb.append(uri); + if (invariantURI != null) { + sb.append(" [ "); + sb.append(originalURI.normalize()); + sb.append(" <--> "); + sb.append(invariantURI.normalize()); + sb.append(" : "); + sb.append(normeq(invariantURI, originalURI)); + sb.append(" ]"); + } + } + Throwable opexc = null; + if (exc != null && (failed & (RTVZ | RSLV)) != 0) { + sb.append(" ").append(op).append(" failed: ").append(exc); + opexc = exc; + } + out.println(uquote(sb.toString())); + if (opexc != null) { + opexc.printStackTrace(out); + } + } + } + + public static String uquote(String str) { + if (str == null) + return str; + StringBuffer sb = new StringBuffer(); + int n = str.length(); + for (int i = 0; i < n; i++) { + char c = str.charAt(i); + if ((c >= ' ') && (c < 0x7f)) { + sb.append(c); + continue; + } + sb.append("\\u"); + String s = Integer.toHexString(c).toUpperCase(); + while (s.length() < 4) + s = "0" + s; + sb.append(s); + } + return sb.toString(); + } + + private static StringBuilder appendEscape(StringBuilder sb, byte b) { + sb.append('%'); + sb.append(hexDigits[(b >> 4) & 0x0f]); + sb.append(hexDigits[(b >> 0) & 0x0f]); + return sb; + } + + private static StringBuilder appendEscape(StringBuilder sb, char... 
c) { + for (byte b : String.valueOf(c).getBytes(StandardCharsets.UTF_8)) { + sb = appendEscape(sb, b); + } + return sb; + } + + static void show(String n, String v) { + out.println(" " + n + + " = ".substring(n.length()) + + uquote(v)); + } + + static void show(String n, String v, String vd) { + if ((v == null) || v.equals(vd)) + show(n, v); + else { + out.println(" " + n + + " = ".substring(n.length()) + + uquote(v) + + " = " + uquote(vd)); + } + } + + public static void show(IRI u) { + show("opaque", "" + u.isOpaque()); + show("scheme", u.getScheme()); + show("ssp", getRawSchemeSpecificPart(u), getSchemeSpecificPart(u)); + show("authority", u.getRawAuthority(), u.getAuthority()); + show("userinfo", u.getRawUserInfo(), u.getUserInfo()); + show("host", u.getRawHostString(), u.getHostString()); + show("dns-host", u.getHost()); + show("host-type", u.getHostType(u.getHostString()).name()); + show("port", "" + u.getPort()); + show("path", u.getRawPath(), u.getPath()); + show("query", u.getRawQuery(), u.getQuery()); + show("fragment", u.getRawFragment(), u.getFragment()); + if (!u.toString().equals(u.toASCIIString())) { + show("toascii", u.toASCIIString()); + } + if (!u.toString().equals(u.toIRIString())) { + show("toiri", u.toIRIString()); + } + if (!u.toString().equals(u.toLenientString())) { + show("tolenient", u.toLenientString()); + } + } + + private void report() { + summarize(); + if (failed == 0) return; + StringBuffer sb = new StringBuffer(); + sb.append("FAIL:"); + if ((failed & PARSEFAIL) != 0) sb.append(" parsefail"); + if ((failed & SCHEME) != 0) sb.append(" scheme"); + if ((failed & SSP) != 0) sb.append(" ssp"); + if ((failed & SSP_D) != 0) sb.append(" sspd"); + if ((failed & OPAQUEPART) != 0) sb.append(" opaquepart"); + if ((failed & USERINFO) != 0) sb.append(" userinfo"); + if ((failed & USERINFO_D) != 0) sb.append(" userinfod"); + if ((failed & HOST) != 0) sb.append(" host"); + if ((failed & PORT) != 0) sb.append(" port"); + if ((failed & REGISTRY) != 
0) sb.append(" registry"); + if ((failed & REGISTRY_D) != 0) sb.append(" registryd"); + if ((failed & PATH) != 0) sb.append(" path"); + if ((failed & PATH_D) != 0) sb.append(" pathd"); + if ((failed & QUERY) != 0) sb.append(" query"); + if ((failed & QUERY_D) != 0) sb.append(" queryd"); + if ((failed & FRAGMENT) != 0) sb.append(" fragment"); + if ((failed & FRAGMENT_D) != 0) sb.append(" fragmentd"); + if ((failed & TOASCII) != 0) sb.append(" toascii"); + if ((failed & IDENT_STR) != 0) sb.append(" ident-str"); + if ((failed & IDENT_URI1) != 0) sb.append(" ident-uri1"); + if ((failed & IDENT_IRI1) != 0) sb.append(" ident-iri1"); + if ((failed & IDENT_URI3) != 0) sb.append(" ident-uri3"); + if ((failed & IDENT_URI5) != 0) sb.append(" ident-uri5"); + if ((failed & IDENT_URI7) != 0) sb.append(" ident-uri7"); + if ((failed & IDENT_QURI) != 0) sb.append(" ident-query"); + if ((failed & IDENT_BLD1) != 0) sb.append(" ident-build1"); + if ((failed & IDENT_BLD2) != 0) sb.append(" ident-build2"); + if ((failed & IDENT_ISTR) != 0) sb.append(" ident-istring"); + if ((failed & IDENT_RAWO) != 0) sb.append(" ident-raw-opaque"); + if ((failed & IDENT_RAW5) != 0) sb.append(" ident-raw-5"); + if ((failed & IDENT_RAW7) != 0) sb.append(" ident-raw-7"); + if ((failed & TOSTRING) != 0) sb.append(" tostring"); + if ((failed & TOISTRING) != 0) sb.append(" iristring"); + if ((failed & TOLSTRING) != 0) sb.append(" lenient"); + if ((failed & RTVZ) != 0) sb.append(" relativize"); + if ((failed & RSLV) != 0) sb.append(" resolve"); + if ((failed & HOST_TYPE) != 0) sb.append(" host-type"); + if ((failed & DNS_HOST) != 0) sb.append(" dns-host"); + if ((failed & IRI_OF) != 0) sb.append(" IRI.of"); + if ((failed & URI_OF) != 0) sb.append(" URI.of"); + out.println(sb); + if (uri != null) show(uri); + throw new RuntimeException("Test failed"); + } + + + + // -- Tests -- + + static void rfc2396() { + + + header("RFC2396: Basic examples"); + + test("ftp://ftp.is.co.za/rfc/rfc1808.txt") + 
.s("ftp").h("ftp.is.co.za").p("/rfc/rfc1808.txt").z();
+
+        test("http://www.math.uio.no/faq/compression-faq/part1.html")
+            .s("http").h("www.math.uio.no").p("/faq/compression-faq/part1.html").z();
+
+        test("mailto:mduerst@ifi.unizh.ch")
+            .s("mailto")
+            .o("mduerst@ifi.unizh.ch")
+            .p("mduerst@ifi.unizh.ch").z();
+
+        test("news:comp.infosystems.www.servers.unix")
+            .s("news")
+            .o("comp.infosystems.www.servers.unix")
+            .p("comp.infosystems.www.servers.unix").z();
+
+        test("telnet://melvyl.ucop.edu/")
+            .s("telnet").h("melvyl.ucop.edu").p("/").z();
+
+        test("http://www.w3.org/Addressing/")
+            .s("http").h("www.w3.org").p("/Addressing/").z();
+
+        test("ftp://ds.internic.net/rfc/")
+            .s("ftp").h("ds.internic.net").p("/rfc/").z();
+
+        test("http://www.ics.uci.edu/pub/ietf/uri/historical.html#WARNING")
+            .s("http").h("www.ics.uci.edu").p("/pub/ietf/uri/historical.html")
+            .f("WARNING").z();
+
+        test("http://www.ics.uci.edu/pub/ietf/uri/#Related")
+            .s("http").h("www.ics.uci.edu").p("/pub/ietf/uri/")
+            .f("Related").z();
+
+
+        header("RFC2396: Normal relative-URI examples (appendix C)");
+
+        IRI base = (test("http://a/b/c/d;p?q")
+                    .s("http").h("a").p("/b/c/d;p").q("q").z().uri());
+
+        // g:h g:h
+        test("g:h")
+            .s("g").o("h").p("h").z()
+            .rslv(base).s("g").o("h").p("h").z();
+
+        // g http://a/b/c/g
+        test("g")
+            .p("g").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").z();
+
+        // ./g http://a/b/c/g
+        test("./g")
+            .p("./g").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").z();
+
+        // g/ http://a/b/c/g/
+        test("g/")
+            .p("g/").z()
+            .rslv(base).s("http").h("a").p("/b/c/g/").z();
+
+        // /g http://a/g
+        test("/g")
+            .p("/g").z()
+            .rslv(base).s("http").h("a").p("/g").z();
+
+        // //g/x/y http://g/x/y
+        test("//g/x/y")
+            .h("g").p("/x/y").z()
+            .rslv(base).s("http").h("g").p("/x/y").z();
+
+        // //g //g (resolved against the scheme-less base "//a": host g, empty path, no scheme listed)
+        test("//g")
+            .h("g").p("").z()
+            .rslv(IRI.of("//a")).h("g").p("").z();
+
+        // //g //g (resolved against the scheme-less base "//a?q": no scheme and no query listed)
+        test("//g")
+            .h("g").p("").z()
+            .rslv(IRI.of("//a?q")).h("g").p("").z();
+
+        // //g? //g? (resolved against the scheme-less base "//a?q": the reference's empty query is listed)
+        test("//g?")
+            .h("g").q("").p("").z()
+            .rslv(IRI.of("//a?q")).h("g").q("").p("").z();
+
+        // //g http://g
+        test("//g")
+            .h("g").p("").z()
+            .rslv(base).s("http").h("g").p("").z();
+
+        // ?y http://a/b/c/d;p?y as per RFC3986
+        test("?y")
+            .p("").q("y").z()
+            .rslv(base).s("http").h("a").p("/b/c/d;p").q("y").z();
+
+        // ?y# http://a/b/c/d;p?y# as per RFC3986 (empty fragment listed)
+        test("?y#")
+            .p("").q("y").f("").z()
+            .rslv(base).s("http").h("a").p("/b/c/d;p").q("y").f("").z();
+
+        // //a/b/c/?y# relativized against //a/b/c/ gives ?y#
+        test("//a/b/c/?y#")
+            .p("/b/c/").h("a").q("y").f("").z()
+            .rtvz(IRI.of("//a/b/c/")).p("").q("y").f("").z();
+
+        // g?y http://a/b/c/g?y
+        test("g?y")
+            .p("g").q("y").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").q("y").z();
+
+        // #s (current document)#s
+        // DEVIATION: Lone fragment parses as relative URI with empty path
+        test("#s")
+            .p("").f("s").z()
+            .rslv(base).s("http").h("a").p("/b/c/d;p").f("s").q("q").z();
+
+        test("#s")
+            .p("").f("s").z()
+            .rslv(IRI.of("a/b/c#f")).p("a/b/c").f("s").z();
+
+        test("#s")
+            .p("").f("s").z()
+            .rslv(IRI.of("a/b/c?q#f")).p("a/b/c").f("s").q("q").z();
+
+        // g#s http://a/b/c/g#s
+        test("g#s")
+            .p("g").f("s").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").f("s").z();
+
+        // g?y#s http://a/b/c/g?y#s
+        test("g?y#s")
+            .p("g").q("y").f("s").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").q("y").f("s").z();
+
+        // ;x http://a/b/c/;x
+        test(";x")
+            .p(";x").z()
+            .rslv(base).s("http").h("a").p("/b/c/;x").z();
+
+        // g;x http://a/b/c/g;x
+        test("g;x")
+            .p("g;x").z()
+            .rslv(base).s("http").h("a").p("/b/c/g;x").z();
+
+        // g;x?y#s http://a/b/c/g;x?y#s
+        test("g;x?y#s")
+            .p("g;x").q("y").f("s").z()
+            .rslv(base).s("http").h("a").p("/b/c/g;x").q("y").f("s").z();
+
+        // . http://a/b/c/
+        test(".")
+            .p(".").z()
+            .rslv(base).s("http").h("a").p("/b/c/").z();
+
+        // ./ http://a/b/c/
+        test("./")
+            .p("./").z()
+            .rslv(base).s("http").h("a").p("/b/c/").z();
+
+        // .. http://a/b/
+        test("..")
+            .p("..").z()
+            .rslv(base).s("http").h("a").p("/b/").z();
+
+        // ../ http://a/b/
+        test("../")
+            .p("../").z()
+            .rslv(base).s("http").h("a").p("/b/").z();
+
+        // ../g http://a/b/g
+        test("../g")
+            .p("../g").z()
+            .rslv(base).s("http").h("a").p("/b/g").z();
+
+        // ../.. http://a/
+        test("../..")
+            .p("../..").z()
+            .rslv(base).s("http").h("a").p("/").z();
+
+        // ../../ http://a/
+        test("../../")
+            .p("../../").z()
+            .rslv(base).s("http").h("a").p("/").z();
+
+        // ../../g http://a/g
+        test("../../g")
+            .p("../../g").z()
+            .rslv(base).s("http").h("a").p("/g").z();
+
+
+        header("RFC2396: Abnormal relative-URI examples (appendix C)");
+
+        // g. = http://a/b/c/g.
+        test("g.")
+            .p("g.").z()
+            .rslv(base).s("http").h("a").p("/b/c/g.").z();
+
+        // .g = http://a/b/c/.g
+        test(".g")
+            .p(".g").z()
+            .rslv(base).s("http").h("a").p("/b/c/.g").z();
+
+        // g.. = http://a/b/c/g..
+        test("g..")
+            .p("g..").z()
+            .rslv(base).s("http").h("a").p("/b/c/g..").z();
+
+        // ..g = http://a/b/c/..g
+        test("..g")
+            .p("..g").z()
+            .rslv(base).s("http").h("a").p("/b/c/..g").z();
+
+        // ./../g = http://a/b/g
+        test("./../g")
+            .p("./../g").z()
+            .rslv(base).s("http").h("a").p("/b/g").z();
+
+        // ./g/. = http://a/b/c/g/
+        test("./g/.")
+            .p("./g/.").z()
+            .rslv(base).s("http").h("a").p("/b/c/g/").z();
+
+        // g/./h = http://a/b/c/g/h
+        test("g/./h")
+            .p("g/./h").z()
+            .rslv(base).s("http").h("a").p("/b/c/g/h").z();
+
+        // g/../h = http://a/b/c/h
+        test("g/../h")
+            .p("g/../h").z()
+            .rslv(base).s("http").h("a").p("/b/c/h").z();
+
+        // g;x=1/./y = http://a/b/c/g;x=1/y
+        test("g;x=1/./y")
+            .p("g;x=1/./y").z()
+            .rslv(base).s("http").h("a").p("/b/c/g;x=1/y").z();
+
+        // g;x=1/../y = http://a/b/c/y
+        test("g;x=1/../y")
+            .p("g;x=1/../y").z()
+            .rslv(base).s("http").h("a").p("/b/c/y").z();
+
+        // g?y/./x = http://a/b/c/g?y/./x
+        test("g?y/./x")
+            .p("g").q("y/./x").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").q("y/./x").z();
+
+        // g?y/../x = http://a/b/c/g?y/../x
+        test("g?y/../x")
+            .p("g").q("y/../x").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").q("y/../x").z();
+
+        // g#s/./x = http://a/b/c/g#s/./x
+        test("g#s/./x")
+            .p("g").f("s/./x").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").f("s/./x").z();
+
+        // g#s/../x = http://a/b/c/g#s/../x
+        test("g#s/../x")
+            .p("g").f("s/../x").z()
+            .rslv(base).s("http").h("a").p("/b/c/g").f("s/../x").z();
+
+        // http:g = http:g
+        test("http:g")
+            .s("http").o("g").p("g").z()
+            .rslv(base).s("http").o("g").p("g").z();
+
+    }
+
+    // most of the cases are covered by rfc2396(), this one
+    // only covers what has been modified in RFC3986 and
+    // RFC3987
+    static void rfc3986() throws URISyntaxException {
+        header("RFC3986: Basic examples");
+        IRI base = (test("http://a/b/c/d;p?q")
+                    .s("http").h("a").p("/b/c/d;p").q("q").z().uri());
+        // ?y = http://a/b/c/d;p?y
+        test("?y").p("").q("y").z()
+            .rslv(base)
+            .s("http").h("a").p("/b/c/d;p").q("y").z();
+
+        // "" = http://a/b/c/d;p?q
+        test("").p("").z()
+            .rslv(base)
+            .s("http").h("a").p("/b/c/d;p").q("q").z();
+
+        // /./g = http://a/g
+        test("/./g").p("/./g").z()
+            .rslv(base)
+            .s("http").h("a").p("/g").z();
+
+        header("RFC3986: Abnormal relative-URI examples (appendix C) - diff 
from RFC 2396"); + + // /./g = http://a/g + test("/./g") + .p("/./g").z() + .rslv(base).s("http").h("a").p("/g").z(); + + + // ../../../g = http://a/g + test("../../../g") + .p("../../../g").z() + .rslv(base).s("http").h("a").p("/g").z(); + + // ../../../../g = http://a/g + test("../../../../g") + .p("../../../../g").z() + .rslv(base).s("http").h("a").p("/g").z(); + + + // /../g = http://a/g + test("/../g") + .p("/../g").z() + .rslv(base).s("http").h("a").p("/g").z(); + + test("/../../g") + .p("/../../g").z() + .rslv(base).s("http").h("a").p("/g").z(); + + header("RFC3986: scheme only"); + + // http: = http: + //test("http:").s("http").p("").z() + // .rslv(base) + // .s("http").p("").z(); + + header("RFC3986: opaque URI with query and fragment"); + + // 6350321 + test("host:opa:que?query#fragment") + .s("host").o("opa:que?query") + .p("opa:que").q("query").f("fragment").z(); + + header("RFC3987: URI -> IRI mapping"); + IRI uri = null; + + // an IRI with a char excluded from iunreserved in it + test("http://foo.com/invalid\ufffdreplacement") + .x().z(); + + // an IRI with a char excluded from iunreserved in it + IRI u1 = test("http://foo.com/invalid%ef%bf%bdreplacement") + .s("http").h("foo.com") + .p("/invalid%ef%bf%bdreplacement") + .pd("/invalid\ufffdreplacement") + .ti("http://foo.com/invalid%EF%BF%BDreplacement") + .ts("http://foo.com/invalid%ef%bf%bdreplacement") + .z().uri(); + + // an IRI with a char excluded from iunreserved in it + IRI u2 = test("http", "foo.com", "/invalid\ufffdreplacement", null, null) + .s("http").h("foo.com") + .p("/invalid%EF%BF%BDreplacement") + .pd("/invalid\ufffdreplacement") + .ti("http://foo.com/invalid%EF%BF%BDreplacement") + .ts("http://foo.com/invalid%EF%BF%BDreplacement") + .z().uri(); + eq(u1, u2); + + // an IRI with invalid %C0%AF sequence + // verify it's not decoded as / - see RFC 3987 section 8 + // Note: U+C0AF is a valid character, but it encodes in UTF-8 with 3 bytes + // into %EC%82%AF - not as %C0%AF + u1 = 
test("http", "foo.com", "/with%C0%AF0xC0AF%EC%82%AF\uc0af", null) + .s("http").h("foo.com") + .p("/with%C0%AF0xC0AF%EC%82%AF\uc0af") + .pd("/with%C0%AF0xC0AF\uc0af\uc0af") + .ti("http://foo.com/with%C0%AF0xC0AF\uc0af\uc0af") + .ta("http://foo.com/with%C0%AF0xC0AF%EC%82%AF%EC%82%AF") + .ts("http://foo.com/with%C0%AF0xC0AF%EC%82%AF\uc0af") + .z().uri(); + u2 = test("http://foo.com/with%C0%AF0xC0AF%EC%82%AF\uc0af") + .s("http").h("foo.com") + .p("/with%C0%AF0xC0AF%EC%82%AF\uc0af") + .pd("/with%C0%AF0xC0AF\uc0af\uc0af") + .ti("http://foo.com/with%C0%AF0xC0AF\uc0af\uc0af") + .z().uri(); + eq(u2, u1); + + // legal UTF-8 sequence mixed with bidi formatting character + uri = test("http://www.example.org/D%C3%BC%E2%80%AE%C3%BCrst") + .s("http").h("www.example.org").p("/D%C3%BC%E2%80%AE%C3%BCrst") + .pd("/D\u00FC%E2%80%AE\u00FCrst").z().uri(); + eq(uri.toIRIString(), "http://www.example.org/D\u00FC%E2%80%AE\u00FCrst"); + + // strictly legal UTF-8 sequence mixed with not strictly legal UTF-8 sequence + uri = test("http://www.example.org/D%C3%BC%FC%C3%BCrst") + .s("http").h("www.example.org").p("/D%C3%BC%FC%C3%BCrst") + .pd("/D\u00FC%FC\u00FCrst").z().uri(); + eq(uri.toIRIString(), "http://www.example.org/D\u00FC%FC\u00FCrst"); + + // toIRIString() involves decoding operation, which is quite dangerous. + // Especially it should not decode illegal character in corresponding + // components, e.g. 
character '^' (ASCII value 0x5E) is illegal in path + uri = test("http", "www.example.org", "/D%C3%BCrst^", null, null) + .s("http").h("www.example.org").p("/D%C3%BCrst%5E").z().uri(); + eq(uri.toIRIString(), "http://www.example.org/D\u00FCrst%5E"); + + // an IRI with a legal UTF-8 sequence in it + uri = test("http://www.example.org/D%C3%BCrst") + .s("http").h("www.example.org").p("/D%C3%BCrst").z().uri(); + eq(uri.toIRIString(), "http://www.example.org/D\u00FCrst"); + + // an IRI with a not strictly legal UTF-8 sequence in it + uri = test("http://www.example.org/D%FCrst") + .s("http").h("www.example.org") + .p("/D%FCrst").pd("/D%FCrst") + .z().uri(); + eq(uri.toIRIString(), "http://www.example.org/D%FCrst"); + + // an IRI with a bidi formatting character in it + // in this case, the bidi formatting character is RLO + uri = test("http://xn--99zt52a.example.org/%e2%80%ae") + .s("http").h("xn--99zt52a.example.org").p("/%e2%80%ae").z().uri(); + eq(uri.toIRIString(), "http://xn--99zt52a.example.org/%E2%80%AE"); + + // unencoded bidi is not allowed + test("http://xn--99zt52a.example.org/a\u202eb").x().z(); + test("http", "xn--99zt52a.example.org", "/a\u202eb", null) + .s("http").h("xn--99zt52a.example.org") + .p("/a%E2%80%AEb") + .pd("/a%E2%80%AEb").z(); + test("http", "//xn--99zt52a.example.org/a\u202eb", null) + .s("http").h("xn--99zt52a.example.org") + .p("/a%E2%80%AEb") + .pd("/a%E2%80%AEb").z(); + test("http", "xn--99zt52a.example.org", "/a\u202eb", null, null) + .s("http").h("xn--99zt52a.example.org") + .p("/a%E2%80%AEb") + .pd("/a%E2%80%AEb").z(); + test("http", null, "xn--99zt52a.example.org", -1, "/a\u202eb", null, null) + .s("http").h("xn--99zt52a.example.org") + .p("/a%E2%80%AEb") + .pd("/a%E2%80%AEb").z(); + + // bug 6345502 + test("scheme://userInfo@:5555/home/root?a=b#ABC") + .s("scheme").u("userInfo").h("").n(5555) + .p("/home/root").q("a=b").f("ABC").z(); + + // bug 6363889 + test("s://@/path").s("s").u("").h("").p("/path").z(); + 
test("s://@").s("s").u("").h("").p("").z(); + test("s://@:8000/path").s("s").u("").h("").n(8000).p("/path").z(); + test("s://@:8000").s("s").u("").h("").n(8000).p("").z(); + + header("Case of %encoded gen-delims: ssp starts with %2F%2F (//)"); + + IRI i1 = test("s:%2F%2F%42u@h/foo?q#f").s("s") + .o("%2F%2FBu@h/foo?q") + .sp("%2F%2F%42u@h/foo?q") + .spd("%2F%2FBu@h/foo?q") + .p("%2F%2F%42u@h/foo") + .pd("%2F%2FBu@h/foo") + .q("q").f("f") + .ts("s:%2F%2F%42u@h/foo?q#f") + .ti("s:%2F%2FBu@h/foo?q#f") + .z().uri(); + test("s", null, "%2F%2F%42u@h/foo", "q", "f") + .iae().z(); // relative path in absolute uri + test("s", null, null, -1, "%2F%2F%42u@h/foo", "q", "f") + .iae().z(); // relative path in absolute uri + IRI i2 = test("s", "%2F%2F%42u@h/foo?q", "f") + .s("s") + .o("%2F%2FBu@h/foo?q") + .sp("%2F%2F%42u@h/foo?q") + .spd("%2F%2FBu@h/foo?q") + .p("%2F%2F%42u@h/foo") + .pd("%2F%2FBu@h/foo") + .q("q").f("f") + .ts("s:%2F%2F%42u@h/foo?q#f") + .ti("s:%2F%2FBu@h/foo?q#f") + .z().uri(); + eq(i1,i2); + IRI i3 = test("%2F%2F%42u@h/foo?q#f") + .sp("%2F%2F%42u@h/foo?q") + .spd("%2F%2FBu@h/foo?q") + .p("%2F%2F%42u@h/foo") + .pd("%2F%2FBu@h/foo") + .q("q").f("f") + .ts("%2F%2F%42u@h/foo?q#f") + .ti("%2F%2FBu@h/foo?q#f") + .z().uri(); + IRI i4 = test(null, null, "%2F%2F%42u@h/foo", "q", "f") + .sp("%2F%2F%42u@h/foo?q") + .spd("%2F%2FBu@h/foo?q") + .p("%2F%2F%42u@h/foo") + .pd("%2F%2FBu@h/foo") + .q("q").f("f") + .ts("%2F%2F%42u@h/foo?q#f") + .ti("%2F%2FBu@h/foo?q#f") + .z().uri(); + eq(i3,i4); + IRI i5 = test(null, null, null, -1, "%2F%2F%42u@h/foo", "q", "f") + .sp("%2F%2F%42u@h/foo?q") + .spd("%2F%2FBu@h/foo?q") + .p("%2F%2F%42u@h/foo") + .pd("%2F%2FBu@h/foo") + .q("q").f("f") + .ts("%2F%2F%42u@h/foo?q#f") + .ti("%2F%2FBu@h/foo?q#f") + .z().uri(); + eq(i3,i5); + IRI i6 = test(null, "%2F%2F%42u@h/foo?q", "f") + .sp("%2F%2F%42u@h/foo?q") + .spd("%2F%2FBu@h/foo?q") + .p("%2F%2F%42u@h/foo") + .pd("%2F%2FBu@h/foo") + .q("q").f("f") + .ts("%2F%2F%42u@h/foo?q#f") + 
.ti("%2F%2FBu@h/foo?q#f") + .z().uri(); + eq(i3, i6); + IRI i7 = test(null, "//%42u@h/foo?q", "f") + .sp("//%42u@h/foo?q") + .spd("//Bu@h/foo?q") + .p("/foo").pd("/foo") + .u("%42u").ud("Bu").h("h") + .g("%42u@h").gd("Bu@h") + .q("q").f("f") + .ts("//%42u@h/foo?q#f") + .ti("//Bu@h/foo?q#f") + .z().uri(); + ne(i6, i7); + + header("Case of %encoded gen-delims: host contains %2F (/)"); + + i1 = test("s://%42u@h%2Ffoo?q#f").s("s").p("") + .u("%42u").ud("Bu") + .h("h%2Ffoo").hd("h/foo") + .g("%42u@h%2Ffoo").gd("Bu@h/foo") + .q("q").f("f") + .ts("s://%42u@h%2Ffoo?q#f") + .ti("s://Bu@h%2Ffoo?q#f") + .z().uri(); + i2 = test("s", "%42u@h%2Ffoo", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("%42u").ud("Bu") + .h("h%2Ffoo").hd("h/foo") + .g("%42u@h%2Ffoo").gd("Bu@h/foo") + .z().uri(); + eq(i1,i2); + i3 = test("s", "%42u@h/foo", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("%42u").ud("Bu") + .h("h%2Ffoo").hd("h/foo") + .g("%42u@h%2Ffoo").gd("Bu@h/foo") + .z().uri(); + eq(i1,i3); + // IRIs are still equals if a non reserverd %encoded char is replaced + // by its decoded form + i4 = test("s", "Bu@h/foo", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("Bu").ud("Bu") + .h("h%2Ffoo").hd("h/foo") + .g("Bu@h%2Ffoo").gd("Bu@h/foo") + .z().uri(); + eq(i1,i4); + i5 = test("s", "Bu", "h/foo", -1, "", "q", "f") + .s("s").p("").q("q").f("f") + .u("Bu").ud("Bu") + .h("h%2Ffoo").hd("h/foo") + .g("Bu@h%2Ffoo").gd("Bu@h/foo") + .z().uri(); + eq(i1,i5); + i6 = test("s://%42u@h%2ffoo?q#f").s("s").p("") + .u("%42u").ud("Bu") + .h("h%2ffoo").hd("h/foo") + .g("%42u@h%2ffoo").gd("Bu@h/foo") + .q("q").f("f") + .ts("s://%42u@h%2ffoo?q#f") + .ti("s://Bu@h%2Ffoo?q#f") + .z().uri(); + eq(i1,i6); + + header("Case of %encoded gen-delims: user-info contains %40 (@)"); + + i1 = test("s://%42%40u@h%2Ffoo?q#f").s("s").p("") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo").hd("h/foo") + .g("%42%40u@h%2Ffoo").gd("B%40u@h/foo") + .q("q").f("f") + .ts("s://%42%40u@h%2Ffoo?q#f") + .ti("s://B%40u@h%2Ffoo?q#f") + 
.z().uri(); + i2 = test("s", "%42%40u@h%2Ffoo", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo").hd("h/foo") + .g("%42%40u@h%2Ffoo").gd("B%40u@h/foo") + .z().uri(); + eq(i1,i2); + i3 = test("s", "%42%40u@h/foo", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo").hd("h/foo") + .g("%42%40u@h%2Ffoo").gd("B%40u@h/foo") + .z().uri(); + eq(i1,i3); + // IRIs are still equals if a non reserverd %encoded char is replaced + // by its decoded form + i4 = test("s", "B%40u@h/foo", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("B%40u").ud("B@u") + .h("h%2Ffoo").hd("h/foo") + .g("B%40u@h%2Ffoo").gd("B%40u@h/foo") + .z().uri(); + eq(i1,i4); + i5 = test("s", "B@u", "h/foo", -1, "", "q", "f") + .s("s").p("").q("q").f("f") + .u("B%40u").ud("B@u") + .h("h%2Ffoo").hd("h/foo") + .g("B%40u@h%2Ffoo").gd("B%40u@h/foo") + .z().uri(); + eq(i1,i5); + i6 = test("s://%42%40u@h%2ffoo?q#f").s("s").p("") + .u("%42%40u").ud("B@u") + .h("h%2ffoo").hd("h/foo") + .g("%42%40u@h%2ffoo").gd("B%40u@h/foo") + .q("q").f("f") + .ts("s://%42%40u@h%2ffoo?q#f") + .ti("s://B%40u@h%2Ffoo?q#f") + .z().uri(); + eq(i1,i6); + + header("Case of %encoded gen-delims: host contains %3A43 (:43)"); + + i1 = test("s://%42%40u@h%2Ffoo%3A43:80?q#f").s("s").p("") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo%3A43").hd("h/foo:43").n(80) + .g("%42%40u@h%2Ffoo%3A43:80").gd("B%40u@h/foo%3A43:80") + .q("q").f("f") + .ts("s://%42%40u@h%2Ffoo%3A43:80?q#f") + .ti("s://B%40u@h%2Ffoo%3A43:80?q#f") + .z().uri(); + i2 = test("s", "%42%40u@h%2Ffoo%3A43:80", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo%3A43").hd("h/foo:43").n(80) + .g("%42%40u@h%2Ffoo%3A43:80").gd("B%40u@h/foo%3A43:80") + .z().uri(); + eq(i1,i2); + i3 = test("s", "%42%40u@h/foo%3A43:80", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo%3A43").hd("h/foo:43").n(80) + .g("%42%40u@h%2Ffoo%3A43:80").gd("B%40u@h/foo%3A43:80") + .z().uri(); 
+ eq(i1,i3); + // IRIs are still equals if a non reserverd %encoded char is replaced + // by its decoded form + i4 = test("s", "B%40u@h/foo%3a43:80", "", "q", "f") + .s("s").p("").q("q").f("f") + .u("B%40u").ud("B@u") + .h("h%2Ffoo%3a43").hd("h/foo:43").n(80) + .g("B%40u@h%2Ffoo%3a43:80").gd("B%40u@h/foo%3A43:80") + .z().uri(); + eq(i1,i4); + i5 = test("s", "B@u", "h/foo:43", 80, "", "q", "f") + .s("s").p("").q("q").f("f") + .u("B%40u").ud("B@u") + .h("h%2Ffoo%3A43").hd("h/foo:43").n(80) + .g("B%40u@h%2Ffoo%3A43:80").gd("B%40u@h/foo%3A43:80") + .z().uri(); + eq(i1,i5); + i6 = test("s://%42%40u@h%2ffoo%3a43:80?q#f").s("s").p("") + .u("%42%40u").ud("B@u") + .h("h%2ffoo%3a43").hd("h/foo:43").n(80) + .g("%42%40u@h%2ffoo%3a43:80").gd("B%40u@h/foo%3A43:80") + .q("q").f("f") + .ts("s://%42%40u@h%2ffoo%3a43:80?q#f") + .ti("s://B%40u@h%2Ffoo%3A43:80?q#f") + .z().uri(); + eq(i1,i6); + + header("Case of %encoded gen-delims: path contains %2F%5B%42%5D%3A%2F (/[B]:/)"); + + i1 = test("s://%42%40u@h%2Ffoo%3A43:80/%2F%5B%42%5D%3A%2Fbar?q#f").s("s") + .p("/%2F%5B%42%5D%3A%2Fbar").pd("/%2F[B]:%2Fbar") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo%3A43").hd("h/foo:43").n(80) + .g("%42%40u@h%2Ffoo%3A43:80").gd("B%40u@h/foo%3A43:80") + .q("q").f("f") + .ts("s://%42%40u@h%2Ffoo%3A43:80/%2F%5B%42%5D%3A%2Fbar?q#f") + .ta("s://%42%40u@h%2Ffoo%3A43:80/%2F%5B%42%5D%3A%2Fbar?q#f") + .ti("s://B%40u@h%2Ffoo%3A43:80/%2F%5BB%5D:%2Fbar?q#f") + .z().uri(); + i2 = test("s", "%42%40u@h%2Ffoo%3A43:80", "/%2F[%42]:%2Fbar", "q", "f") + .p("/%2F%5B%42%5D:%2Fbar").pd("/%2F[B]:%2Fbar") + .s("s").q("q").f("f") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo%3A43").hd("h/foo:43").n(80) + .g("%42%40u@h%2Ffoo%3A43:80").gd("B%40u@h/foo%3A43:80") + .ts("s://%42%40u@h%2Ffoo%3A43:80/%2F%5B%42%5D:%2Fbar?q#f") + .ta("s://%42%40u@h%2Ffoo%3A43:80/%2F%5B%42%5D:%2Fbar?q#f") + .ti("s://B%40u@h%2Ffoo%3A43:80/%2F%5BB%5D:%2Fbar?q#f") + .z().uri(); + eq(i1,i2); + i3 = test("s", "%42%40u@h/foo%3A43:80", "/%2F%5BB%5D%3A%2Fbar", "q", "f") + 
.p("/%2F%5BB%5D%3A%2Fbar").pd("/%2F[B]:%2Fbar") + .s("s").q("q").f("f") + .u("%42%40u").ud("B@u") + .h("h%2Ffoo%3A43").hd("h/foo:43").n(80) + .g("%42%40u@h%2Ffoo%3A43:80").gd("B%40u@h/foo%3A43:80") + .ts("s://%42%40u@h%2Ffoo%3A43:80/%2F%5BB%5D%3A%2Fbar?q#f") + .ta("s://%42%40u@h%2Ffoo%3A43:80/%2F%5BB%5D%3A%2Fbar?q#f") + .ti("s://B%40u@h%2Ffoo%3A43:80/%2F%5BB%5D:%2Fbar?q#f") + .z().uri(); + eq(i1,i3); + // IRIs are still equals if a non reserverd %encoded char is replaced + // by its decoded form + i4 = test("s", "B%40u@h/foo%3a43:80", "/%2f%5BB%5D%3a%2Fbar", "q", "f") + .p("/%2f%5BB%5D%3a%2Fbar").pd("/%2F[B]:%2Fbar") + .s("s").q("q").f("f") + .u("B%40u").ud("B@u") + .h("h%2Ffoo%3a43").hd("h/foo:43").n(80) + .g("B%40u@h%2Ffoo%3a43:80").gd("B%40u@h/foo%3A43:80") + .ts("s://B%40u@h%2Ffoo%3a43:80/%2f%5BB%5D%3a%2Fbar?q#f") + .ta("s://B%40u@h%2Ffoo%3a43:80/%2f%5BB%5D%3a%2Fbar?q#f") + .ti("s://B%40u@h%2Ffoo%3A43:80/%2F%5BB%5D:%2Fbar?q#f") + .z().uri(); + eq(i1,i4); + i5 = test("s", "B@u", "h/foo:43", 80, "/%2F[B]:%2Fbar", "q", "f") + .p("/%2F%5BB%5D:%2Fbar").pd("/%2F[B]:%2Fbar") + .s("s").q("q").f("f") + .u("B%40u").ud("B@u") + .h("h%2Ffoo%3A43").hd("h/foo:43").n(80) + .g("B%40u@h%2Ffoo%3A43:80").gd("B%40u@h/foo%3A43:80") + .ts("s://B%40u@h%2Ffoo%3A43:80/%2F%5BB%5D:%2Fbar?q#f") + .ta("s://B%40u@h%2Ffoo%3A43:80/%2F%5BB%5D:%2Fbar?q#f") + .ti("s://B%40u@h%2Ffoo%3A43:80/%2F%5BB%5D:%2Fbar?q#f") + .z().uri(); + eq(i1,i5); + i6 = test("s://%42%40u@h%2ffoo%3a43:80/%2f%5b%42%5d%3a%2fbar?q#f").s("s") + .p("/%2f%5b%42%5d%3a%2fbar").pd("/%2F[B]:%2Fbar") + .u("%42%40u").ud("B@u") + .h("h%2ffoo%3a43").hd("h/foo:43").n(80) + .g("%42%40u@h%2ffoo%3a43:80").gd("B%40u@h/foo%3A43:80") + .q("q").f("f") + .ts("s://%42%40u@h%2ffoo%3a43:80/%2f%5b%42%5d%3a%2fbar?q#f") + .ti("s://B%40u@h%2Ffoo%3A43:80/%2F%5BB%5D:%2Fbar?q#f") + .z().uri(); + eq(i1,i6); + + header("Bidi characters and non-characters in BMP"); + + Consumer bidiAndNonCharBMPTest = (c) -> { + char ch = c; + String s = "http", 
h="xn--99zt52a.example.org", p="/a" + ch + 'b'; + String u = String.format("%s://%s%s", s, h, p); + String pe = appendEscape(new StringBuilder("/a"), ch).append('b').toString(); + String ts = String.format("%s://%s%s", s, h, pe); + + test(u).x().z(); + test(ts).s(s).h(h).p(pe).pd(pe) + .ts(ts).ti(ts).z(); + test(s, h, p, null) + .s(s).h(h).p(pe).pd(pe) + .ts(ts).ti(ts).z(); + test(s, "//" + h + p, null) + .s(s).h(h).p(pe).pd(pe).sp("//"+h+pe) + .ts(ts).ti(ts).z(); + test(s, h, p, null, null) + .s(s).h(h).p(pe).pd(pe) + .ts(ts).ti(ts).z(); + test(s, null, h, -1, p, null, null) + .s(s).h(h).p(pe).pd(pe) + .ts(ts).ti(ts).z(); + }; + + // Build a stream that contains all bidi chars and all + // non characters in the Basic Multilingual Plane, and + // verify they are rejected/encoded properly + Stream.concat(Stream.concat(BIDIS.stream(), + Stream.iterate((char)0xFDD0, + (Character c) -> c <= 0xFDEF, + (Character c) -> (char)(c + 1))), + Stream.of((char)0xFFFE, (char) 0xFFFF)) + .forEach(bidiAndNonCharBMPTest); + + header("Non characters in supplementary planes"); + + IntConsumer nonCharsCodepointsTest = (c) -> { + char[] ch = Character.toChars(c); + assert ch.length == 2; + out.println(String.format("\n-- Plane %d, codepoint: U+%H", c/0x10000, c)); + char[] pc = {'/', 'a', ch[0], ch[1], 'b'}; + String p = new String(pc); + String s = "http", h="xn--99zt52a.example.org"; + String u = String.format("%s://%s%s", s, h, p); + String pe = appendEscape(new StringBuilder("/a"), ch[0], ch[1]) + .append('b').toString(); + String ts = String.format("%s://%s%s", s, h, pe); + + test(u).x().z(); + test(ts).s(s).h(h).p(pe).pd(pe) + .ts(ts).ti(ts).z(); + test(s, h, p, null) + .s(s).h(h).p(pe).pd(pe) + .ts(ts).ti(ts).z(); + test(s, "//" + h + p, null) + .s(s).h(h).p(pe).pd(pe).sp("//"+h+pe) + .ts(ts).ti(ts).z(); + test(s, h, p, null, null) + .s(s).h(h).p(pe).pd(pe) + .ts(ts).ti(ts).z(); + test(s, null, h, -1, p, null, null) + .s(s).h(h).p(pe).pd(pe) + .ts(ts).ti(ts).z(); + }; + + // two 
non chars per plane + IntStream.range(1,16) // supplementary planes 1-15 + .flatMap((i) -> IntStream.of( + (i << 16) + 0x00FFFE, (i << 16) + 0x00FFFF)) + .forEach(nonCharsCodepointsTest); + IntStream.of(0x10FFFE, 0x10FFFF) // supplementary plane 16 + .forEach(nonCharsCodepointsTest); + + header("Private characters"); + + IntConsumer iprivateInQueryTest = (cp) -> { + char[] ch = Character.toChars(cp); + out.println(String.format("\n-- Plane %d, private codepoint: U+%H", cp/0x10000, cp)); + assert ch.length <= 2; // 1 if BMP, 2 otherwise (surrogate pair) + char[] qc = ch.length == 1 + ? new char[] {'q', '=', 'a', ch[0], 'b'} + : new char[] {'q', '=', 'a', ch[0], ch[1], 'b'}; + String q = new String(qc); + String s = "http", h="xn--99zt52a.example.org", p="/p"; + String u = String.format("%s://%s%s?%s", s, h, p, q); + String qe = appendEscape(new StringBuilder("q=a"), ch) + .append('b').toString(); + String ta = String.format("%s://%s%s?%s", s, h, p, qe); + + test(u).s(s).h(h).p(p) + .q(q).qd(q) + .ts(u).ti(u).z(); + test(ta).s(s).h(h).p(p) + .q(qe).qd(q) + .ts(ta).ta(ta).ti(u).z(); + test(s, h, p, q, null) + .s(s).h(h).p(p) + .q(q).qd(q) + .ts(u).ti(u).ta(ta).z(); + test(s, "//" + h + p + "?" + q, null) + .s(s).h(h).p(p).q(q).qd(q) + .sp("//"+h+p+"?"+q) + .ts(u).ti(u).ta(ta).z(); + test(s, null, h, -1, p, q, null) + .s(s).h(h).p(p) + .q(q).qd(q) + .ts(u).ti(u).ta(ta).z(); + }; + + IntConsumer iprivateInPathTest = (cp) -> { + char[] ch = Character.toChars(cp); + out.println(String.format("\n-- Plane %d, private codepoint: U+%H", cp/0x10000, cp)); + assert ch.length <= 2; // 1 if BMP, 2 otherwise (surrogate pair) + char[] qc = ch.length == 1 + ? new char[] {'q', '=', 'a', ch[0], 'b'} + : new char[] {'q', '=', 'a', ch[0], ch[1], 'b'}; + char[] pc = ch.length == 1 + ? 
new char[] {'/', 'a', ch[0], 'b'} + : new char[] {'/', 'a', ch[0], ch[1], 'b'}; + String q = new String(qc); + String p = new String(pc); + String s = "http", h="xn--99zt52a.example.org"; + String u = String.format("%s://%s%s?%s", s, h, p, q); + String qe = appendEscape(new StringBuilder("q=a"), ch) + .append('b').toString(); + String pe = appendEscape(new StringBuilder("/a"), ch) + .append('b').toString(); + String ue = String.format("%s://%s%s?%s", s, h, pe, q); + String ta = String.format("%s://%s%s?%s", s, h, pe, qe); + + test(u).x().z(); + test(ue).s(s).h(h) + .p(pe).pd(p) + .q(q).qd(q) + .ts(ue).ti(ue).ta(ta).z(); + test(ta).s(s).h(h) + .p(pe).pd(p) + .q(qe).qd(q) + .ts(ta).ta(ta).ti(ue).z(); + test(s, h, p, q, null) + .s(s).h(h) + .p(pe).pd(p) + .q(q).qd(q) + .ts(ue).ti(ue).ta(ta).z(); + test(s, "//" + h + p + "?" + q, null) + .s(s).h(h).p(pe).pd(p).q(q).qd(q) + .sp("//"+h+pe+"?"+q) + .spd("//"+h+p+"?"+q) + .ts(ue).ti(ue).ta(ta).z(); + test(s, null, h, -1, p, q, null) + .s(s).h(h).p(pe).pd(p) + .q(q).qd(q) + .ts(ue).ti(ue).ta(ta).z(); + }; + + IntConsumer iprivateInFragTest = (cp) -> { + char[] ch = Character.toChars(cp); + out.println(String.format("\n-- Plane %d, private codepoint: U+%H", cp/0x10000, cp)); + assert ch.length <= 2; // 1 if BMP, 2 otherwise (surrogate pair) + char[] qc = ch.length == 1 + ? new char[] {'q', '=', 'a', ch[0], 'b'} + : new char[] {'q', '=', 'a', ch[0], ch[1], 'b'}; + char[] fc = ch.length == 1 + ? 
new char[] {'f', 'a', ch[0], 'b'} + : new char[] {'f', 'a', ch[0], ch[1], 'b'}; + String q = new String(qc); + String f = new String(fc); + String p = "/p", pe = "/p"; + String s = "http", h="xn--99zt52a.example.org"; + String u = String.format("%s://%s%s?%s#%s", s, h, pe, q, f); + String qe = appendEscape(new StringBuilder("q=a"), ch) + .append('b').toString(); + String fe = appendEscape(new StringBuilder("fa"), ch) + .append('b').toString(); + String ue = String.format("%s://%s%s?%s#%s", s, h, pe, q, fe); + String ta = String.format("%s://%s%s?%s#%s", s, h, pe, qe, fe); + + test(u).x().z(); + test(ue).s(s).h(h) + .p(pe).pd(p) + .q(q).qd(q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + test(ta).s(s).h(h) + .p(pe).pd(p) + .q(qe).qd(q) + .f(fe).fd(f) + .ts(ta).ta(ta).ti(ue).z(); + test(s, h, p, q, f) + .s(s).h(h) + .p(pe).pd(p) + .f(fe).fd(f) + .q(q).qd(q) + .ts(ue).ti(ue).ta(ta).z(); + test(s, "//" + h + p + "?" + q, f) + .s(s).h(h).p(pe).pd(p).q(q).qd(q) + .sp("//"+h+pe+"?"+q) + .spd("//"+h+p+"?"+q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + test(s, null, h, -1, p, q, f) + .s(s).h(h).p(pe).pd(p) + .q(q).qd(q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + }; + + IntConsumer iprivateInHostTest = (cp) -> { + char[] ch = Character.toChars(cp); + out.println(String.format("\n-- Plane %d, private codepoint: U+%H", cp/0x10000, cp)); + assert ch.length <= 2; // 1 if BMP, 2 otherwise (surrogate pair) + char[] qc = ch.length == 1 + ? new char[] {'q', '=', 'a', ch[0], 'b'} + : new char[] {'q', '=', 'a', ch[0], ch[1], 'b'}; + char[] hc = ch.length == 1 + ? 
new char[] {'h', '.', ch[0], '.', 'b'} + : new char[] {'h', '.', ch[0], ch[1], '.', 'b'}; + String f = "f", fe = "f"; + String q = new String(qc); + String h = new String(hc); + String p = "/p", pe = "/p"; + String s = "http"; + String u = String.format("%s://%s%s?%s#%s", s, h, pe, q, fe); + String qe = appendEscape(new StringBuilder("q=a"), ch) + .append('b').toString(); + String he = appendEscape(new StringBuilder("h."), ch) + .append(".b").toString(); + String ue = String.format("%s://%s%s?%s#%s", s, he, pe, q, fe); + String ta = String.format("%s://%s%s?%s#%s", s, he, pe, qe, fe); + + test(u).x().z(); + test(ue).s(s) + .h(he).hd(h) + .g(he).gd(h) + .p(pe).pd(p) + .q(q).qd(q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + test(ta).s(s) + .h(he).hd(h) + .g(he).gd(h) + .p(pe).pd(p) + .q(qe).qd(q) + .f(fe).fd(f) + .ts(ta).ta(ta).ti(ue).z(); + test(s, h, p, q, f).s(s) + .h(he).hd(h) + .g(he).gd(h) + .p(pe).pd(p) + .f(fe).fd(f) + .q(q).qd(q) + .ts(ue).ti(ue).ta(ta).z(); + test(s, "//" + h + p + "?" + q, f) + .s(s).h(he).hd(h) + .g(he).gd(h) + .p(pe).pd(p).q(q).qd(q) + .sp("//"+he+pe+"?"+q) + .spd("//"+h+p+"?"+q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + test(s, null, h, -1, p, q, f) + .s(s).h(he).hd(h) + .g(he).gd(h) + .p(pe).pd(p) + .q(q).qd(q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + }; + + IntConsumer iprivateInUserTest = (cp) -> { + char[] ch = Character.toChars(cp); + out.println(String.format("\n-- Plane %d, private codepoint: U+%H", cp/0x10000, cp)); + assert ch.length <= 2; // 1 if BMP, 2 otherwise (surrogate pair) + char[] qc = ch.length == 1 + ? new char[] {'q', '=', 'a', ch[0], 'b'} + : new char[] {'q', '=', 'a', ch[0], ch[1], 'b'}; + char[] uc = ch.length == 1 + ? 
new char[] {'u', '-', ch[0], '-', 'u'} + : new char[] {'u', '-', ch[0], ch[1], '-', 'u'}; + String f = "f", fe = "f"; + String q = new String(qc); + String ui = new String(uc); + String g = ui + "@h"; + String p = "/p", pe = "/p"; + String s = "http", h="h", he="h"; + String u = String.format("%s://%s%s?%s#%s", s, g, pe, q, fe); + String qe = appendEscape(new StringBuilder("q=a"), ch) + .append('b').toString(); + String uie = appendEscape(new StringBuilder("u-"), ch) + .append("-u").toString(); + String ge = uie + "@h"; + String ue = String.format("%s://%s%s?%s#%s", s, ge, pe, q, fe); + String ta = String.format("%s://%s%s?%s#%s", s, ge, pe, qe, fe); + + test(u).x().z(); + test(ue).s(s) + .h(he).hd(h) + .u(uie).ud(ui) + .g(ge).gd(g) + .p(pe).pd(p) + .q(q).qd(q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + test(ta).s(s) + .h(he).hd(h) + .u(uie).ud(ui) + .g(ge).gd(g) + .p(pe).pd(p) + .q(qe).qd(q) + .f(fe).fd(f) + .ts(ta).ta(ta).ti(ue).z(); + test(s, g, p, q, f).s(s) + .h(he).hd(h) + .u(uie).ud(ui) + .g(ge).gd(g) + .p(pe).pd(p) + .f(fe).fd(f) + .q(q).qd(q) + .ts(ue).ti(ue).ta(ta).z(); + test(s, "//" + g + p + "?" 
+ q, f) + .s(s).h(he).hd(h) + .u(uie).ud(ui) + .g(ge).gd(g) + .p(pe).pd(p).q(q).qd(q) + .sp("//"+ge+pe+"?"+q) + .spd("//"+g+p+"?"+q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + test(s, ui, h, -1, p, q, f) + .s(s).h(he).hd(h) + .u(uie).ud(ui) + .g(ge).gd(g) + .p(pe).pd(p) + .q(q).qd(q) + .f(fe).fd(f) + .ts(ue).ti(ue).ta(ta).z(); + }; + + header("Private characters in query"); + + // iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD + // test boundaries + one char in the middle of each interval + IntStream.of(0xE000, 0xEFAC, 0xF8FF, + 0xF0000, 0xFCAFE, 0xFFFFD, + 0x100000, 0x10EFAC, 0x10FFFD) + .forEach(iprivateInQueryTest); + + header("Private characters in path"); + + IntStream.of(0xE000, 0xEFAC, 0xF8FF, + 0xF0000, 0xFCAFE, 0xFFFFD, + 0x100000, 0x10EFAC, 0x10FFFD) + .forEach(iprivateInPathTest); + + header("Private characters in fragment"); + + IntStream.of(0xE000, 0xEFAC, 0xF8FF, + 0xF0000, 0xFCAFE, 0xFFFFD, + 0x100000, 0x10EFAC, 0x10FFFD) + .forEach(iprivateInFragTest); + + header("Private characters in host"); + + IntStream.of(0xE000, 0xEFAC, 0xF8FF, + 0xF0000, 0xFCAFE, 0xFFFFD, + 0x100000, 0x10EFAC, 0x10FFFD) + .forEach(iprivateInHostTest); + + header("Private characters in user-info"); + + IntStream.of(0xE000, 0xEFAC, 0xF8FF, + 0xF0000, 0xFCAFE, 0xFFFFD, + 0x100000, 0x10EFAC, 0x10FFFD) + .forEach(iprivateInUserTest); + + } + + static void ip() { + + header("IP addresses"); + + test("http://1.2.3.4:5") + .s("http").h("1.2.3.4").ipv4().n(5).p("").z(); + + // From RFC2732 + + test("http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html") + .s("http").h("[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]") + .ipv6().n(80).p("/index.html").z(); + + test("http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:10%12]:80/index.html") + .s("http").h("[FEDC:BA98:7654:3210:FEDC:BA98:7654:10%12]") + .ipv6().n(80).p("/index.html").z(); + + test("http://[1080:0:0:0:8:800:200C:417A]/index.html") + .s("http").h("[1080:0:0:0:8:800:200C:417A]") + .ipv6().p("/index.html").z(); + + 
test("http://[1080:0:0:0:8:800:200C:417A%1]/index.html") + .s("http").h("[1080:0:0:0:8:800:200C:417A%1]") + .ipv6().p("/index.html").z(); + + test("http://[3ffe:2a00:100:7031::1]") + .s("http").h("[3ffe:2a00:100:7031::1]") + .ipv6().p("").z(); + + test("http://[1080::8:800:200C:417A]/foo") + .s("http").h("[1080::8:800:200C:417A]").ipv6().p("/foo").z(); + + test("http://[::192.9.5.5]/ipng") + .s("http").h("[::192.9.5.5]").ipv6().p("/ipng").z(); + + test("http://[::192.9.5.5%interface]/ipng") + .s("http").h("[::192.9.5.5%interface]").ipv6().p("/ipng").z(); + + test("http://[::FFFF:129.144.52.38]:80/index.html") + .s("http").h("[::FFFF:129.144.52.38]") + .ipv6().n(80).p("/index.html").z(); + + test("http://[2010:836B:4179::836B:4179]") + .s("http").h("[2010:836B:4179::836B:4179]").ipv6().p("").z(); + + // From RFC2373 + + test("http://[FF01::101]") + .s("http").h("[FF01::101]").ipv6().p("").z(); + + test("http://[::1]") + .s("http").h("[::1]").ipv6().p("").z(); + + test("http://[::]") + .s("http").h("[::]").ipv6().p("").z(); + + test("http://[::%hme0]") + .s("http").h("[::%hme0]").ipv6().p("").z(); + + test("http://[0:0:0:0:0:0:13.1.68.3]") + .s("http").h("[0:0:0:0:0:0:13.1.68.3]").ipv6().p("").z(); + + test("http://[0:0:0:0:0:FFFF:129.144.52.38]") + .s("http").h("[0:0:0:0:0:FFFF:129.144.52.38]").ipv6().p("").z(); + + test("http://[0:0:0:0:0:FFFF:129.144.52.38%33]") + .s("http").h("[0:0:0:0:0:FFFF:129.144.52.38%33]").ipv6().p("").z(); + + test("http://[0:0:0:0:0:ffff:1.2.3.4]") + .s("http").h("[0:0:0:0:0:ffff:1.2.3.4]").ipv6().p("").z(); + + test("http://[::13.1.68.3]") + .s("http").h("[::13.1.68.3]").ipv6().p("").z(); + + // Optional IPv6 brackets in constructors + + test("s", null, "1:2:3:4:5:6:7:8", -1, null, null, null) + .s("s").h("[1:2:3:4:5:6:7:8]").ipv6().p("").z(); + + test("s", null, "[1:2:3:4:5:6:7:8]", -1, null, null, null) + .s("s").h("[1:2:3:4:5:6:7:8]").ipv6().p("").z(); + + test("s", null, "[1:2:3:4:5:6:7:8]", -1, null, null, null) + 
.s("s").h("[1:2:3:4:5:6:7:8]").ipv6().p("").z(); + + test("s", "1:2:3:4:5:6:7:8", null, null) + .s("s").h("[1:2:3:4:5:6:7:8]").ipv6().p("").z(); + + test("s", "1:2:3:4:5:6:7:8%hme0", null, null) + .s("s").h("[1:2:3:4:5:6:7:8%hme0]").ipv6().p("").z(); + + test("s", "1:2:3:4:5:6:7:8%1", null, null) + .s("s").h("[1:2:3:4:5:6:7:8%1]").ipv6().p("").z(); + + test("s", "[1:2:3:4:5:6:7:8]", null, null) + .s("s").h("[1:2:3:4:5:6:7:8]").ipv6().p("").z(); + + test("s", "[1:2:3:4:5:6:7:8]", null, null, null) + .s("s").h("[1:2:3:4:5:6:7:8]").ipv6().p("").z(); + + // Error cases + + test("s", "1:2:3:4:5:6:7:8", null, null, null) + .x().z(); + test("http://[ff01:234/foo").x().z(); + test("http://[ff01:234:zzz]/foo").x().z(); + test("http://[foo]").x().z(); + test("http://[]").x().z(); + test("http://[129.33.44.55]").x().z(); + test("http://[ff:ee:dd:cc:bb::aa:9:8]").x().z(); + test("http://[fffff::1]").x().z(); + test("http://[ff::ee::8]").x().z(); + test("http://[1:2:3:4::5:6:7:8]").x().z(); + test("http://[1:2]").x().z(); + test("http://[1:2:3:4:5:6:7:8:9]").x().z(); + test("http://[1:2:3:4:5:6:7:8%]").x().z(); + test("http://[1:2:3:4:5:6:7:8%!/]").x().z(); + test("http://[::1.2.3.300]").x().z(); + test("http://[1.2.3.4:5]").x().z(); + test("http://1:2:3:4:5:6:7:8").x().z(); + test("http://[1.2.3.4]/").x().z(); + test("http://[1.2.3.4/").x().z(); + test("http://[foo]/").x().z(); + test("http://[foo/").x().z(); + test("s", "[foo]", "/", null, null) + .s("s").h("%5Bfoo%5D").hd("[foo]") // as per RFC3986, parses as reg-name + .regn().p("/").z(); + test("s", "[foo", "/", null, null) + .s("s").h("%5Bfoo").hd("[foo") // as per RFC3986, parses as reg-name + .regn().p("/").z(); + test("s", "[::foo", "/", null, null).x().z(); + + // Test hostnames that might initially look like IPv4 addresses + + // TODO: is this really reasonable? 
+ test("http://1.2.3") // as per RFC3986, parses as reg-name + .s("http").h("1.2.3").regn().p("").z(); + test("http://1.2.3.300") // as per RFC3986, parses as reg-name + .s("http").h("1.2.3.300").regn().p("").z(); + test("http://1.2.3.4.5") // as per RFC3986, parses as reg-name + .s("http").h("1.2.3.4.5").regn().p("").z(); + + test("s://1.2.3.com").psa().s("s").h("1.2.3.com").dns().p("").z(); + test("s://1.2.3.4me.com").psa().s("s").h("1.2.3.4me.com").dns().p("").z(); + + test("s://7up.com").psa().s("s").h("7up.com").dns().p("").z(); + test("s://7up.com/p").psa().s("s").h("7up.com").dns().p("/p").z(); + test("s://7up").psa().s("s").h("7up").p("").dns().z(); + test("s://7up/p").psa().s("s").h("7up").dns().p("/p").z(); + test("s://7up.").psa().s("s").h("7up.").dns().p("").z(); + test("s://7up./p").psa().s("s").h("7up.").dns().p("/p").z(); + + // bug 6345551 + test("s", "ui@[::0000]", "/path", null, null) + .s("s").u("ui").h("[::0000]").ipv6().p("/path").z(); + + // IPvFuture = "v" 1*HEXDIG "." 
1*( unreserved / sub-delims / ":" ) + test("s://[v8.a]/path").s("s").h("[v8.a]").ipvf().p("/path").z(); + test("s://[v8.a:b]/path").s("s").h("[v8.a:b]").ipvf().p("/path").z(); + test("s://[v8.a:]/path").s("s").h("[v8.a:]").ipvf().p("/path").z(); + test("s://[v8.:a]/path").s("s").h("[v8.:a]").ipvf().p("/path").z(); + test("s://[v8.2008]/path").s("s").h("[v8.2008]").ipvf().p("/path").z(); + test("s://[v8.~]/path").s("s").h("[v8.~]").ipvf().p("/path").z(); + test("s://[v8.$]/path").s("s").h("[v8.$]").ipvf().p("/path").z(); + test("s://[v8.:]/path").s("s").h("[v8.:]").ipvf().p("/path").z(); + test("s://ui@[v8.a:b]:80/path").ipvf() + .s("s").u("ui").h("[v8.a:b]").n(80).p("/path").z(); + test("s://@[v8.a:b]:80/path").ipvf() + .s("s").u("").h("[v8.a:b]").n(80).p("/path").z(); + test("s://ui@[v8.a:b]:/path").ipvf() + .s("s").u("ui").h("[v8.a:b]").p("/path").z(); + test("s://@[v8.a:b]:/path").ipvf() + .s("s").u("").h("[v8.a:b]").p("/path").z(); + test("s://[v8]/path").x().z(); + test("s://[vv.]/path").x().z(); + test("s://[v8.@]/path").x().z(); + test("s://@[vf0123456789aBcDe..::::a0-._~:!$&'()*+,;=]/path") + .s("s").u("").ipvf() + .h("[vf0123456789aBcDe..::::a0-._~:!$&'()*+,;=]") + .p("/path").z(); + test("s://[v8.]/path").x().z(); + test("s://[v.]/path").x().z(); + test("s://[v]/path").x().z(); + + // Non dns hosts + + test("s://100\u20AC.com/path") + .s("s") + .h("100\u20AC.com") + .hd("100\u20AC.com") + .p("/path") + .regn() + .z(); + test("s://100%E2%82%AC.com/path") + .s("s") + .h("100%E2%82%AC.com") + .hd("100\u20AC.com") + .p("/path") + .regn() + .z(); + test("s://x_z.com/path") + .s("s") + .h("x_z.com") + .p("/path") + .regn() + .z(); + test("s://%41%42%43.com/path") + .s("s") + .h("%41%42%43.com") + .hd("ABC.com") + .p("/path") + .dns() // The host name is a DNS name - though the raw host is not + .z(); + } + + + static void misc() throws URISyntaxException { + + IRI base = IRI.parseIRI("s://h/a/b"); + IRI rbase = IRI.parseIRI("a/b/c/d"); + + + header("Corner 
cases"); + + // The empty URI parses as a relative URI with an empty path + test("").p("").z() + .rslv(base) + .s("s").h("h").p("/a/b").z(); // as in RFC3986 + + // Resolving solo queries and fragments + test("#f").p("").f("f").z() + .rslv(base).s("s").h("h").p("/a/b").f("f").z(); + test("?q").p("").q("q").z() + .rslv(base) + .s("s").h("h").p("/a/b").q("q").z(); // as in RFC3986 + + // Fragment is not part of ssp + test("p#f").p("p").f("f").sp("p").z(); + test("s:p#f").s("s").o("p").p("p").f("f").z(); + test("p#f") + .rslv(base).s("s").h("h").p("/a/p").f("f").sp("//h/a/p").z(); + test("").p("").sp("").z(); + + // scheme only + test("abc:").s("abc").p("").sp("").z(); + test("abc:/").s("abc").p("/").sp("/").z(); + test("abc://").s("abc").p("").sp("//").g("").h("").z(); // as in RFC3986 + test("abc:///").s("abc").p("/").sp("///").g("").h("").z(); // as in RFC3986 + test("abc://?").s("abc").p("").sp("//?").g("").h("").q("").z(); // as in RFC3986 + test("abc://#").s("abc").p("").sp("//").g("").h("").f("").z(); // as in RFC3986 + + + header("Emptiness"); + + // Components that may be empty + test("//").p("").g("").h("").z(); // Authority (w/o path) + test("///").p("/").g("").h("").z(); // Authority (w/ path) + test("///p").p("/p").g("").h("").z(); // Authority (w/ path) + test("//@h/p").u("").h("h").p("/p").z(); // User info + test("//h:/p").h("h").p("/p").z(); // Port + test("//h").h("h").p("").z(); // Path + test("//h?q").h("h").p("").q("q").z(); // Path (w/query) + test("//?q").p("").q("q").g("").h("").z(); // Authority (w/query) + test("//#f").p("").f("f").g("").h("").z(); // Authority (w/fragment) + test("p?#").p("p").q("").f("").z(); // Query & fragment + + // Components that may not be empty + test(":").x().z(); // Scheme + test("x:").s("x").p("").z(); // as in RFC3986 + + header("Resolution, normalization, and relativization"); + + // Resolving relative paths + test("../e/f").p("../e/f").z() + .rslv(rbase).p("a/b/e/f").z(); + 
test("../../../../d").p("../../../../d").z() + .rslv(rbase).p("../d").z(); + test("../../../d:e").p("../../../d:e").z() + .rslv(rbase).p("./d:e").z(); + test("../../../d:e/f").p("../../../d:e/f").z() + .rslv(rbase).p("./d:e/f").z(); + IRI odd = test("s://h/p/a://b/c/d") + .s("s").h("h").p("/p/a://b/c/d").z() + .rtvz(IRI.of("s://h/p/")) + .p("./a:/b/c/d").z().uri(); + test(odd.toString()) + .p("./a:/b/c/d").norm() + .p("./a:/b/c/d").z(); + + // Normalization + test("a/./c/../d/f").p("a/./c/../d/f").z() + .rst().norm().p("a/d/f").z(); + test("http://a/./b/c/../d?q#f") + .s("http").h("a").p("/./b/c/../d").q("q").f("f").z() + .rst().norm().s("http").h("a").p("/b/d").q("q").f("f").z(); + test("a/../b").p("a/../b").z() + .rst().norm().p("b").z(); + test("a/../b:c").p("a/../b:c").z() + .rst().norm().p("./b:c").z(); + + // Normalization of already normalized URI should yield the + // same URI + Test t1 = test("s://h/p?/../b#/../d").s("s").h("h").p("/p").q("/../b").f("/../d").z(); + IRI u1 = t1.uri(); + IRI u2 = t1.rst().norm().s("s").h("h").p("/p").q("/../b").f("/../d").z().uri(); + eq(u1, u2); + eqeq(u1, u2); + + // Normalization of not already normalized URI should yield + // a different URI + Test t2 = test("s://h/../p").s("s").h("h").p("/../p").z(); // RFC 3986 -> http://h/p + IRI iri1 = t2.uri(); + IRI iri2 = t2.rst().norm().s("s").h("h").p("/p").z().uri(); + ne(iri1, iri2); + + // RFC 3986: normalization also removes redundant colon in authority + test("s://:").s("s").h("").g(":").p("").ts("s://:").z() + .rst().norm().s("s").h("").g("").p("/").ts("s:///").z(); + test("s://x:").s("s").h("x").g("x:").p("").ts("s://x:").z() + .rst().norm().s("s").h("x").g("x").p("/").ts("s://x/").z(); + test("s://@:").s("s").u("").h("").g("@:").p("").ts("s://@:").z() + .rst().norm().s("s").u("").h("").g("@").p("/").ts("s://@/").z(); + test("s://u@x:").s("s").u("u").h("x").g("u@x:").p("").ts("s://u@x:").z() + .rst().norm().s("s").u("u").h("x").g("u@x").p("/").ts("s://u@x/").z(); + 
test("s://u@:").s("s").u("u").h("").g("u@:").p("").ts("s://u@:").z() + .rst().norm().s("s").u("u").h("").g("u@").p("/").ts("s://u@/").z(); + test("s://:/").s("s").h("").g(":").p("/").ts("s://:/").z() + .rst().norm().s("s").h("").g("").p("/").ts("s:///").z(); + test("s://x:/").s("s").h("x").g("x:").p("/").ts("s://x:/").z() + .rst().norm().s("s").h("x").g("x").p("/").ts("s://x/").z(); + test("s://@:/").s("s").u("").h("").g("@:").p("/").ts("s://@:/").z() + .rst().norm().s("s").u("").h("").g("@").p("/").ts("s://@/").z(); + test("s://u@x:/").s("s").u("u").h("x").g("u@x:").p("/").ts("s://u@x:/").z() + .rst().norm().s("s").u("u").h("x").g("u@x").p("/").ts("s://u@x/").z(); + test("s://u@:/").s("s").u("u").h("").g("u@:").p("/").ts("s://u@:/").z() + .rst().norm().s("s").u("u").h("").g("u@").p("/").ts("s://u@/").z(); + + // normalization and paths ending with .. + test("/..").norm().p("/").z(); + test("../../a/../").norm().p("../../").z(); + test("../../a/..").norm().p("../../").z(); + test("../..").norm().p("../..").z(); + test("..").norm().p("..").z(); + test("s:/..").norm().s("s").p("/").z(); + test("s:///..").norm().s("s").h("").g("").p("/").z(); + test("s:///a/..").norm().s("s").h("").g("").p("/").z(); + test("s:///a/b/..").norm().s("s").h("").g("").p("/a/").z(); + test("s:/a/..").norm().s("s").p("/").z(); + test("s:/a/b/..").norm().s("s").p("/a/").z(); + test("s:..").norm().s("s").p("..").z(); + + // Relativization + test("/a/b").p("/a/b").z() + .rtvz(IRI.parseIRI("/a")).p("a/b").z(); // bug in java.net.URI + test("/a/b").p("/a/b").z() + .rtvz(IRI.parseIRI("/a/")).p("b").z(); + test("a/b").p("a/b").z() + .rtvz(IRI.parseIRI("a")).p("a/b").z(); // bug in java.net.URI + test("/a/b").p("/a/b").z() + .rtvz(IRI.parseIRI("/a/b")).p("").z(); // Result is empty path + test("a/../b:c/d").p("a/../b:c/d").z() + .rtvz(IRI.parseIRI("./b:c/")).p("d").z(); + + test("http://a/b/d/e?q#f") + .s("http").h("a").p("/b/d/e").q("q").f("f").z() + .rtvz(IRI.parseIRI("http://a/b/?r#g")) + 
.p("d/e").q("q").f("f").z(); + + test("http://a/b/d/e?q#f") // bug in java.net.URI + .s("http").h("a").p("/b/d/e").q("q").f("f").z() + .rtvz(IRI.parseIRI("http://a/b/g?r#g")) + .p("d/e").q("q").f("f").z(); + + // Resolution should preserve redundant colon in authority + // RFC 3986 Section 5.2.1. says that normalization of the + // base URI is optional, and Section 5.2.2 transfers the + // authority unchanged. + // The rule of least surprise suggests we should keep the + // redundant colon if present - which can be removed by + // calling normalize() later on if needed. + + // redundant colon in base URI authority + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://") + .s("s").g("").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://:") + .s("s").g(":").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://:/") + .s("s").g(":").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://:/x") + .s("s").g(":").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://:/x/y") + .s("s").g(":").h("").p("/x/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://@:") + .s("s").g("@:").u("").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://@:/") + .s("s").g("@:").u("").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://@:/x") + .s("s").g("@:").u("").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://@:/x/y") + .s("s").g("@:").u("").h("").p("/x/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://u@:") + .s("s").g("u@:").u("u").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://u@:/") + .s("s").g("u@:").u("u").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://u@:/x") + .s("s").g("u@:").u("u").h("").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://u@:/x/y") + .s("s").g("u@:").u("u").h("").p("/x/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + 
.rst().rslv("s://u@h:") + .s("s").g("u@h:").u("u").h("h").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://u@h:/") + .s("s").g("u@h:").u("u").h("h").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://u@h:/x") + .s("s").g("u@h:").u("u").h("h").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://u@h:/x/y") + .s("s").g("u@h:").u("u").h("h").p("/x/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://h:") + .s("s").g("h:").h("h").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://h:/") + .s("s").g("h:").h("h").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://h:/x") + .s("s").g("h:").h("h").p("/a/b/c").z(); + test("a/b/c").p("a/b/c").z() + .rst().rslv("s://h:/x/y") + .s("s").g("h:").h("h").p("/x/a/b/c").z(); + + // redundant colon in given URI authority + test("///a/b/c").p("/a/b/c").g("").h("").z() + .rst().rslv("s:") + .s("s").g("").h("").p("/a/b/c").z(); + test("//:/a/b/c").p("/a/b/c").g(":").h("").z() + .rst().rslv("s:") + .s("s").g(":").h("").p("/a/b/c").z(); + test("//://a/b/c").p("//a/b/c").g(":").h("").z() + .rst().rslv("s:") + .s("s").g(":").h("").p("/a/b/c").z(); + test("//@:/a/b/c").p("/a/b/c").g("@:").u("").h("").z() + .rst().rslv("s:") + .s("s").g("@:").u("").h("").p("/a/b/c").z(); + test("//@://a/b/c").p("//a/b/c").g("@:").u("").h("").z() + .rst().rslv("s:") + .s("s").g("@:").u("").h("").p("/a/b/c").z(); + test("//u@:/a/b/c").p("/a/b/c").g("u@:").u("u").h("").z() + .rst().rslv("s:") + .s("s").g("u@:").u("u").h("").p("/a/b/c").z(); + test("//u@://a/b/c").p("//a/b/c").g("u@:").u("u").h("").z() + .rst().rslv("s:") + .s("s").g("u@:").u("u").h("").p("/a/b/c").z(); + test("//u@h:/a/b/c").p("/a/b/c").g("u@h:").u("u").h("h").z() + .rst().rslv("s:") + .s("s").g("u@h:").u("u").h("h").p("/a/b/c").z(); + test("//u@h://a/b/c").p("//a/b/c").g("u@h:").u("u").h("h").z() + .rst().rslv("s:") + .s("s").g("u@h:").u("u").h("h").p("/a/b/c").z(); + 
test("//h:/a/b/c").p("/a/b/c").g("h:").h("h").z() + .rst().rslv("s:") + .s("s").g("h:").h("h").p("/a/b/c").z(); + test("//h://a/b/c").p("//a/b/c").g("h:").h("h").z() + .rst().rslv("s:") + .s("s").g("h:").h("h").p("/a/b/c").z(); + + // resolution against a base URI ending with .. + test("c").rslv("s://h/a/b/..") + .s("s").h("h").p("/a/c"); + test("c").rslv("s://h/a/b/../") + .s("s").h("h").p("/a/c"); + test("c").rslv("..").p("../c"); + test("c").rslv("../..").p("../../c"); + test("c").rslv("../a/..").p("../c"); + + // Some non intuitive corner cases. Some of these are not + // handled correctly by java.net.URI - which is a bug. + IRI abase = IRI.parseIRI("a"); + + test("a/b").p("a/b").z().rtvz(abase).p("a/b").z(); + test("b") .p("b") .z().rtvz(abase).p("b") .z(); + test("ab") .p("ab") .z().rtvz(abase).p("ab") .z(); + test("/b") .p("/b") .z().rtvz(abase).p("/b") .z(); + + test("a/b").p("a/b").z().rslv(abase).p("a/b").z(); + test("b") .p("b") .z().rslv(abase).p("b") .z(); + test("ab") .p("ab") .z().rslv(abase).p("ab") .z(); + test("/b") .p("/b") .z().rslv(abase).p("/b") .z(); + + test("../b").p("../b").z().rtvz(abase).p("../b") .z(); + test("../b").p("../b").z().rslv(abase).p("../b") .z(); + test("../b").p("../b").z().rtvz(rbase).p("../b") .z(); + test("../b").p("../b").z().rslv(rbase).p("a/b/b").z(); + + test("..").p("..").z().rtvz(abase).p("..") .z(); + test("..").p("..").z().rslv(abase).p("..") .z(); + test("..").p("..").z().rtvz(rbase).p("..") .z(); + test("..").p("..").z().rslv(rbase).p("a/b/").z(); + + IRI vbase1 = IRI.parseIRI("http://a/b/c/d;p?q"); + IRI vbase2 = IRI.parseIRI("a/b/c?q#f"); + IRI vbase3 = IRI.parseIRI("//a/b/c/d;p?q"); + + test("//g").h("g").p("").rtvz(vbase1).h("g").p("").z(); + test("//g/x/y").h("g").p("/x/y").rtvz(vbase1).h("g").p("/x/y").z(); + test("#").f("").p("").rtvz(vbase1).p("").f("").z(); + test("").p("").rtvz(vbase1).p("").z(); + test("#s").p("").f("s").rtvz(vbase2).p("").f("s").z(); + 
test("http://a/b/c/d;p/../d;p?q").rtvz(vbase1).p("").q("q").z(); + test("http://a/b/c/d;p/../d?q").rtvz(vbase1).p("d").q("q").z(); + test("http://a/b/c/d;p/../d;p/b?q").rtvz(vbase1).p("d;p/b").q("q").z(); + test("http://a/b/c/d;p/../d/b?q").rtvz(vbase1).p("d/b").q("q").z(); + test("http://a/b/c/d;p/../../d/b?q").rtvz(vbase1) + .s("http").h("a").p("/b/c/d;p/../../d/b").q("q").z(); + test("//a/b/c/d;p/../d;p/b?q").rtvz(vbase3).p("d;p/b").q("q").z(); + test("//a/b/c/d;p/../d/b?q").rtvz(vbase3).p("d/b").q("q").z(); + test("//a/b/c/d;p/../../d/b?q").rtvz(vbase3) + .h("a").p("/b/c/d;p/../../d/b").q("q").z(); + + // parseServerAuthority + test("/a/b").psa().p("/a/b").z(); + test("s://u@h:1/p") + .psa().s("s").u("u").h("h").n(1).p("/p").z(); + test("s://u@h:-foo/p").x().z(); + test("s://h:999999999999999999999999").x().z(); + test("s://:/b").psa().s("s").h("").p("/b").z(); + + + header("Constructors and factories"); + + test("s", null, null, -1, "p", null, null).x().z(); + test(null, null, null, -1, null, null, null).p("").z(); + test(null, null, null, -1, "p", null, null).p("p").z(); + test(null, null, "foo%20bar", -1, null, null, null) + .h("foo%20bar").hd("foo bar").p("").z(); // as per RFC3986 + test(null, null, "foo", -100, null, null, null).x().z(); + test("s", null, null, -1, "", null, null).s("s").p("").z(); + test("s", null, null, -1, "/p", null, null).s("s").p("/p").z(); + test("s", "u", "h", 10, "/p", "q", "f") + .s("s").u("u").h("h").n(10).p("/p").q("q").f("f").z(); + test("s", "a:b", "/p", "q", "f").x().z(); + test("s", "h", "/p", "f") + .s("s").h("h").p("/p").f("f").z(); + test("s", "p", "f").s("s").o("p").p("p").f("f").z(); + test("s", "/p", "f").s("s").p("/p").f("f").z(); + testCreate("s://u@h/p?q#f") + .s("s").u("u").h("h").p("/p").q("q").f("f").z(); + } + + static void npes() throws URISyntaxException { + + header("NullPointerException"); + + IRI base = IRI.of("mailto:root@foobar.com"); + + out.println(); + + try { + base.resolve((IRI)null); + throw new 
RuntimeException("NullPointerException not thrown"); + } catch (NullPointerException x) { + out.println("resolve((IRI)null) -->"); + out.println("Correct exception: " + x); + } + + out.println(); + + try { + base.resolve((String)null); + throw new RuntimeException("NullPointerException not thrown"); + } catch (NullPointerException x) { + out.println("resolve((String)null) -->"); + out.println("Correct exception: " + x); + } + + out.println(); + + try { + base.relativize((IRI)null); + throw new RuntimeException("NullPointerException not thrown"); + } catch (NullPointerException x) { + out.println("relativize((String)null) -->"); + out.println("Correct exception: " + x); + } + + testCount += 3; + } + + static MethodType mt(Class res, Object... args) { + Class[] cls = new Class[args.length]; + for (int i=0; i createHierarchical = new ConcurrentHashMap<>(); + + static IRI createHierarchical(Object... params) throws URISyntaxException { + // Lookup either of IRI.createHierarchical methods + MethodHandle mh = createHierarchical + .computeIfAbsent(mt(IRI.class, params), Test::lookupCreateHierarchical); + try { + // Invoke IRI.createHierarchical + return (IRI)mh.invokeWithArguments(params); + } catch (URISyntaxException | RuntimeException | Error x) { + throw x; + } catch(Throwable x) { + throw new RuntimeException("Unexpected exception: " + x, x); + } + + } + + static void iaes() throws URISyntaxException { + + header("IllegalArgumentException"); + + + out.println(); + Object[][] iaes = { + {"s", "userinfo", null, -1, "/p", null, null}, + {"s", null, null, 0, "/p", null, null}, + {"s", null, null, -1, "//p", null, null}, + {"s", null, "//p", null, null}, + {null, "userinfo", null, -1, "/p", null, null}, + {null, null, null, 0, "/p", null, null}, + {null, null, null, -1, "//p", null, null}, + {null, null, "//p", null, null}, + {null, null, null, -1, "a://b/c/d", null, null}, + {null, null, "a://b/c/d", null, null}, + {null, null, "", -1, "a://b/c/d", null, null}, + {null, 
"", "a://b/c/d", null, null}, + {null, "", "u@h:80/p", null, null}, + {null, "", "", -1, "u@h:80/p", null, null}, + {null, null, "", -1, "u@h:80/p", null, null}, + {"s", "", "u@h:80/p", null, null}, + {"s", "", "", -1, "u@h:80/p", null, null}, + {"s", null, "", -1, "u@h:80/p", null, null}, + {"s", "userinfo", "", -1, "a://b/c/d", null, null}, + {"s", null, "", 0, "a://b/c/d", null, null}, + }; + + for (Object[] params : iaes) { + try { + String msg = "IRI.createHierarchical" + Arrays.asList(params).toString() + .replace("[","(").replace("]", ")"); + out.println(msg); + IRI iri = createHierarchical(params); + throw new RuntimeException( + "Expected IllegalArgumentException not raised for: " + msg); + } catch (IllegalArgumentException x) { + System.out.println("\t --> Got expected exception: " + x); + } + testCount++; + } + + test("s", "a://b/c/d", null) + .o("a://b/c/d") + .p("a://b/c/d") + .s("s").z(); + test("s", "", "/a://b/c/d", null) + .p("/a://b/c/d").s("s").h("").z(); + test("s", "", "/a://b/c/d", null, null) + .p("/a://b/c/d").s("s").h("").g("").z(); + test("s", null, "", -1, "/a://b/c/d", null, null) + .p("/a://b/c/d").s("s").h("").g("").z(); + test(null, "", "/a://b/c/d", null) + .p("/a://b/c/d").h("").z(); + test(null, "", "/a://b/c/d", null, null) + .p("/a://b/c/d").h("").g("").z(); + test(null, null, "", -1, "/a://b/c/d", null, null) + .p("/a://b/c/d").h("").g("").z(); + + } + + + static void chars() throws URISyntaxException { + + header("Escapes and non-US-ASCII characters"); + + IRI uri; + + // Escape pairs + test("%0a%0A%0f%0F%01%09zz") + .p("%0a%0A%0f%0F%01%09zz").z(); + test("foo%1").x().z(); + test("foo%z").x().z(); + test("foo%9z").x().z(); + + // Escapes not permitted in scheme, host + test("s%20t://a").x().z(); + test("//a%20b").h("a%20b").p("").z(); // as in RFC3986 + + // Escapes permitted in opaque part, userInfo, registry, path, + // query, and fragment + test("//u%20v@a").u("u%20v").h("a").p("").z(); + test("/p%20q").p("/p%20q").z(); + 
test("/p?q%20").p("/p").q("q%20").z(); + test("/p#%20f").p("/p").f("%20f").z(); + + // Non-US-ASCII chars + test("s\u00a7t://a").x().z(); + test("//\u00a7/b").h("\u00a7").p("/b").z(); // as in RFC3986 + test("//u\u00a7v@a").u("u\u00a7v").h("a").p("").z(); + test("/p\u00a7q").p("/p\u00a7q").z(); + test("/p?q\u00a7").p("/p").q("q\u00a7").z(); + test("/p#\u00a7f").p("/p").f("\u00a7f").z(); + + // 4648111 - Escapes quoted by toString after resolution + uri = IRI.parseIRI("http://a/b/c/d;p?q"); + test("/p%20p") + .rslv(uri).s("http").h("a").p("/p%20p") + .ts("http://a/p%20p") + .ti("http://a/p%20p").z(); + + test("/p%20p").rtvz(uri).p("/p%20p") + .ts("/p%20p") + .ti("/p%20p").z(); + + test("/p%32p").rtvz(uri).p("/p%32p") + .ts("/p%32p") + .ti("/p2p").z(); + + // 4464135: Forbid unwise characters throughout opaque part + test("foo:x{bar").x().z(); + test("foo:{bar").x().z(); + + // 4438319: Single-argument constructor requires quotation, + // preserves escapes + test("//u%01@h/a/b/%02/c?q%03#f%04") + .u("u%01").ud("u\1") + .h("h") + .p("/a/b/%02/c").pd("/a/b/\2/c") + .q("q%03").qd("q\3") + .f("f%04").fd("f\4") + .z(); + test("/a/b c").x().z(); + + // 4438319: Multi-argument constructors quote illegal chars and + // preserve legal non-ASCII chars + // \uA001-\uA009 are visible characters, \u2000 is a space character + test(null, "u\uA001\1", "h", -1, + "/p% \uA002\2\u2000", + "q% \uA003\3\u2000", + "f% \uA004\4\u2000") + .u("u\uA001%01").h("h") + .p("/p%25%20\uA002%02%E2%80%80").pd("/p% \uA002\2\u2000") + .q("q%25%20\uA003%03%E2%80%80").qd("q% \uA003\3\u2000") + .f("f%25%20\uA004%04%E2%80%80").fd("f% \uA004\4\u2000").z(); + test(null, "g\uA001\1", + "/p% \uA002\2\u2000", + "q% \uA003\3\u2000", + "f% \uA004\4\u2000") + .h("g\uA001%01") // as in RFC3986 + .p("/p%25%20\uA002%02%E2%80%80").pd("/p% \uA002\2\u2000") + .q("q%25%20\uA003%03%E2%80%80").qd("q% \uA003\3\u2000") + .f("f%25%20\uA004%04%E2%80%80").fd("f% \uA004\4\u2000").z(); + test(null, null, "/p% \uA002\2\u2000", 
"f% \uA004\4\u2000") + .p("/p%25%20\uA002%02%E2%80%80").pd("/p% \uA002\2\u2000") + .f("f%25%20\uA004%04%E2%80%80").fd("f% \uA004\4\u2000").z(); + test(null, "/sp% \uA001\1\u2000", "f% \uA004\4\u2000") + .sp("/sp%25%20\uA001%01%E2%80%80").spd("/sp% \uA001\1\u2000") + .p("/sp%25%20\uA001%01%E2%80%80").pd("/sp% \uA001\1\u2000") + .f("f%25%20\uA004%04%E2%80%80").fd("f% \uA004\4\u2000").z(); + + // 4438319: Non-raw accessors decode all escaped octets + test("/%25%20%E2%82%AC%E2%80%80") + .p("/%25%20%E2%82%AC%E2%80%80").pd("/% \u20AC\u2000").z(); + + // 4438319: toASCIIString + test("/\uCAFE\uBABE") + .p("/\uCAFE\uBABE").ta("/%EC%AB%BE%EB%AA%BE").z(); + + // 4991359 and 4866303: bad quoting by defineSchemeSpecificPart() + IRI base = IRI.parseIRI("http://host/foo%20bar/a/b/c/d"); + test ("resolve") + .rslv(base).spd("//host/foo bar/a/b/c/resolve") + .sp("//host/foo%20bar/a/b/c/resolve").s("http") + .pd("/foo bar/a/b/c/resolve").h("host") + .p("/foo%20bar/a/b/c/resolve").z(); + + // 6773270: java.net.URI fails to escape u0000 + test("s", "a", "/\u0000", null) + .s("s").p("/%00").h("a") + .ta("s://a/%00").z(); + } + + + static void eq0(IRI u, IRI v) throws URISyntaxException { + testCount++; + if (!u.equals(v)) + throw new RuntimeException("Not equal: " + u + " " + v); + int uh = u.hashCode(); + int vh = v.hashCode(); + if (uh != vh) + throw new RuntimeException("Hash codes not equal: " + + u + " " + Integer.toHexString(uh) + " " + + v + " " + Integer.toHexString(vh)); + out.println(); + out.println(u + " == " + v + + " [" + Integer.toHexString(uh) + "]"); + } + + static void cmp0(IRI u, IRI v, boolean same) + throws URISyntaxException + { + int c = u.compareTo(v); + if ((c == 0) != same) + throw new RuntimeException("Comparison inconsistent: " + u + " " + v + + " " + c); + } + + static void eq(IRI u, IRI v) throws URISyntaxException { + eq0(u, v); + cmp0(u, v, true); + } + + static void eq(String expected, String actual) { + if (expected == null && actual == null) { + 
return; + } + if (expected != null && expected.equals(actual)) { + return; + } + throw new AssertionError(String.format( + "Strings are not equal: '%s', '%s'", expected, actual)); + } + + static void eqeq(IRI u, IRI v) { + testCount++; + if (u != v) + throw new RuntimeException("Not ==: " + u + " " + v); + } + + static void ne0(IRI u, IRI v) throws URISyntaxException { + testCount++; + if (u.equals(v)) + throw new RuntimeException("Equal: " + u + " " + v); + out.println(); + out.println(u + " != " + v + + " [" + Integer.toHexString(u.hashCode()) + + " " + Integer.toHexString(v.hashCode()) + + "]"); + } + + static void ne(IRI u, IRI v) throws URISyntaxException { + ne0(u, v); + cmp0(u, v, false); + } + + static void lt(IRI u, IRI v) throws URISyntaxException { + ne0(u, v); + int c = u.compareTo(v); + if (c >= 0) { + show(u); + show(v); + throw new RuntimeException("Not less than: " + u + " " + v + + " " + c); + } + out.println(u + " < " + v); + } + + static void lt(String s, String t) throws URISyntaxException { + lt(IRI.parseIRI(s), IRI.parseIRI(t)); + } + + static void gt(IRI u, IRI v) throws URISyntaxException { + lt(v, u); + } + + static void eqHashComp() throws URISyntaxException { + + header("Equality, hashing, and comparison"); + + IRI o = IRI.parseIRI("mailto:foo@bar.com"); + IRI r = IRI.parseIRI("reg://some%20registry/b/c/d?q#f"); + IRI s = IRI.parseIRI("http://jag:cafebabe@java.sun.com:94/b/c/d?q#f"); + eq(o, o); + lt(o, r); + lt(s, o); + lt(s, r); + eq(o, IRI.parseIRI("MaILto:foo@bar.com")); + gt(o, IRI.parseIRI("mailto:foo@bar.COM")); + eq(r, IRI.parseIRI("rEg://some%20registry/b/c/d?q#f")); + eq(r, IRI.parseIRI("reg://Some%20Registry/b/c/d?q#f")); // as in RFC3986 + gt(r, IRI.parseIRI("reg://some%20registry/b/c/D?q#f")); + eq(s, IRI.parseIRI("hTtP://jag:cafebabe@Java.Sun.COM:94/b/c/d?q#f")); + gt(s, IRI.parseIRI("http://jag:CafeBabe@java.sun.com:94/b/c/d?q#f")); + lt(s, IRI.parseIRI("http://jag:cafebabe@java.sun.com:94/b/c/d?r#f")); + lt(s, 
IRI.parseIRI("http://jag:cafebabe@java.sun.com:94/b/c/d?q#g")); + eq(IRI.parseIRI("http://host/a%00bcd"), IRI.parseIRI("http://host/a%00bcd")); + ne(IRI.parseIRI("http://host/a%00bcd"), IRI.parseIRI("http://host/aZ00bcd")); + eq0(IRI.parseIRI("http://host/abc%e2def%C3ghi"), + IRI.parseIRI("http://host/abc%E2def%c3ghi")); + + lt("p", "s:p"); + lt("s:p", "T:p"); + lt("S:p", "t:p"); + lt("s:/p", "s:p"); + lt("s:p", "s:q"); + lt("s:p#f", "s:p#g"); + lt("s://u@h:1", "s://v@h:1"); + lt("s://u@h:1", "s://u@i:1"); + lt("s://u@h:1", "s://v@h:2"); + lt("s://a%20b", "s://a%20c"); + lt("s://a%20b", "s://aab"); + lt("s://A_", "s://AA"); // as in RFC3986 + lt("s:/p", "s:/q"); + lt("s:/p?q", "s:/p?r"); + lt("s:/p#f", "s:/p#g"); + + lt("s://h", "s://h/p"); + lt("s://h/p", "s://h/p?q"); + + // unnecessary percent-encoded octet + IRI u1 = IRI.of("http://jag:cafebabe@java.sun.com:94/d%75rst/c/d?q#f"); + IRI u2 = IRI.createHierarchical(u1.getScheme(), u1.getUserInfo(), + u1.getHostString(), u1.getPort(), u1.getPath(), + u1.getQuery(), u1.getFragment()); + eq(u1, u2); + + // because IRI.compareTo() is always called after IRI.equals() in eq(), + // lt(), or gt(), this unit test fails to capture regression reported + // by 6348515. So call cmp0() directly here.
+ IRI uu1 = IRI.parseIRI("h://a/p"); + IRI uu2 = IRI.parseIRI("h://a/p"); + cmp0(uu1, uu2, true); + + } + + + static void serial(IRI u) throws IOException, URISyntaxException { + + ByteArrayOutputStream bo = new ByteArrayOutputStream(); + ObjectOutputStream oo = new ObjectOutputStream(bo); + + oo.writeObject(u); + oo.close(); + + ByteArrayInputStream bi = new ByteArrayInputStream(bo.toByteArray()); + ObjectInputStream oi = new ObjectInputStream(bi); + try { + Object o = oi.readObject(); + eq(u, (IRI)o); + } catch (ClassNotFoundException x) { + x.printStackTrace(); + throw new RuntimeException(x.toString()); + } + + testCount++; + } + + static void serial() throws IOException, URISyntaxException { + header("Serialization"); + + serial(IRI.of("http://java.sun.com/jdk/1.4?release#beta")); + serial(IRI.of("s://h/p").resolve("/long%20path/")); + } + + + static void urls() throws URISyntaxException { + + header("URLs"); + + IRI uri; + URL url; + boolean caught = false; + + out.println(); + uri = IRI.parseIRI("http://a/p?q#f"); + try { + url = uri.toURL(); + } catch (MalformedURLException x) { + throw new RuntimeException(x.toString()); + } + if (!url.toString().equals("http://a/p?q#f")) + throw new RuntimeException("Incorrect URL: " + url); + out.println(uri + " url --> " + url); + + out.println(); + uri = IRI.parseIRI("a/b"); + try { + out.println(uri + " url --> "); + url = uri.toURL(); + } catch (IllegalArgumentException x) { + caught = true; + out.println("Correct exception: " + x); + } catch (MalformedURLException x) { + caught = true; + throw new RuntimeException("Incorrect exception: " + x); + } + if (!caught) + throw new RuntimeException("Incorrect URL: " + url); + + out.println(); + uri = IRI.parseIRI("foo://bar/baz"); + caught = false; + try { + out.println(uri + " url --> "); + url = uri.toURL(); + } catch (MalformedURLException x) { + caught = true; + out.println("Correct exception: " + x); + } catch (IllegalArgumentException x) { + caught = true; + throw new 
RuntimeException("Incorrect exception: " + x); + } + if (!caught) + throw new RuntimeException("Incorrect URL: " + url); + + testCount += 3; + } + + static void utils() throws URISyntaxException { + final String[] quoteTests = new String[] { + // unquoted quoted + "%", "%", + "%2", "%2", + "%25", "%2525", + "%2x", "%2x", + "aaaaa%2x%2", "aaaaa%2x%2", + "aaa%aa%2x%2", "aaa%25aa%2x%2", + "aaa%Ba%2x%2%20%a", "aaa%25Ba%2x%2%2520%a", + "%C0%AF", "%25C0%25AF", + "%EF%BF%BD", "%25EF%25BF%25BD", + }; + + header("IRI.quoteEncodedOctets"); + + out.println(); + for (int i=0; i \"" + quoteTests[i + 1] + "\""); + String quoted = IRI.quoteEncodedOctets(quoteTests[i]); + if (!quoted.equals(quoteTests[i+1])) { + out.println(" failed! produced \"" + quoted +"\""); + throw new RuntimeException("IRI.quoteEncodedOctets failed for \"" + + quoteTests[i] +"\""); + } + String unquoted = IRI.unquoteEncodedOctets(quoted, true); + if (!unquoted.equals(quoteTests[i])) { + out.println(" failed! unquoting produced \"" + unquoted +"\""); + throw new RuntimeException("IRI.quoteEncodedOctets failed to unquote for \"" + + quoteTests[i] +"\""); + } + } + + final String[] unquoteTests = new String[] { + // quoted unquoted requoted replaced + "%25", "%", "%", "%", + "x%25", "x%", "x%", "x%", + "%25x", "%x", "%x", "%x", + "%2540", "%40", "%2540", "%40", + "%C0%AF", "%C0%AF", "%25C0%25AF", "\ufffd\ufffd", + "%C0%AF%40", "%C0%AF@", "%25C0%25AF@", "\ufffd\ufffd@", + "%41%40%42%x", "A@B%x", "A@B%x", "A@B%x", + "%EF%BF%BD", "\ufffd", "\ufffd", "\ufffd" + }; + + header("IRI.unquoteEncodedOctets"); + out.println(); + for (int i=0; i \"" + + unquoteTests[i + 1] + "\" [\"" + unquoteTests[i+3] + "\"]"); + String unquoted = IRI.unquoteEncodedOctets(unquoteTests[i], false); + if (!unquoted.equals(unquoteTests[i+1])) { + out.println(" failed! 
produced \"" + unquoted +"\""); + throw new RuntimeException("IRI.unquoteEncodedOctets failed for \"" + + unquoteTests[i] +"\""); + } + String quoted = IRI.quoteEncodedOctets(unquoted); + if (!quoted.equals(unquoteTests[i+2])) { + out.println(" failed! quoting produced \"" + quoted +"\""); + throw new RuntimeException("IRI.unquoteEncodedOctets failed to quote for \"" + + unquoteTests[i] +"\""); + } + String dequoted = IRI.unquoteEncodedOctets(quoted, false); + if (!dequoted.equals(unquoteTests[i+1])) { + out.println(" failed! dequoting quoted produced \"" + dequoted +"\""); + throw new RuntimeException("IRI.unquoteEncodedOctets failed to dequote for \"" + + unquoteTests[i] +"\""); + } + String ident = IRI.unquoteEncodedOctets(IRI.quoteEncodedOctets(unquoteTests[i]), true); + if (!ident.equals(unquoteTests[i])) { + out.println(" failed! ident produced \"" + ident +"\""); + throw new RuntimeException("IRI.unquoteEncodedOctets failed ident for \"" + + unquoteTests[i] +"\""); + } + String replaced = IRI.unquoteEncodedOctets(unquoteTests[i], true); + if (!replaced.equals(unquoteTests[i+3])) { + out.println(" failed! 
replaced produced \"" + replaced +"\""); + throw new RuntimeException("IRI.unquoteEncodedOctets failed replaced for \"" + + unquoteTests[i] +"\""); + } + } + } + + static String auth(String u, String h, int port) { + StringBuilder sb = new StringBuilder(); + if (u != null) sb.append(u).append('@'); + if (h != null) sb.append(h); + if (port >= 0) sb.append(':').append(port); + return sb.toString(); + } + + static void lenient() { + header("Lenient parsing (tests for IRI.parseLenient)"); + + for (char c : "|<> {}^`\"\\".toCharArray()) { + String ld = String.valueOf(c); + String l = appendEscape(new StringBuilder(), c).toString(); + String s = "s", u="u", ue = "u" + l, ud = "u" + ld; + String h = "h", he = h + l, hd = h + ld; + int port = 80; + String p = "/p", pe = p + l, pd = p + ld; + String q = "q=q", qe = q + l, qd = q + ld; + String f = "f", fe = f+ l, fd = f + ld; + String a = auth(u, h, port); + + String ae = auth(ue, h, port), ad = auth(ud, h, port); + String in = String.format("%s://%s%s?%s#%s", s, ad, p, q, f); + + lenient(in).s("s") + .u(ue).ud(ud).h(h).hd(h).n(port) + .g(ae).gd(ad) + .p(p).pd(p) + .q(q).qd(q).f(f).fd(f) + .tl(in) + .z(); + + ae = auth(u, he, port); ad = auth(u, hd, port); + in = String.format("%s://%s%s?%s#%s", s, ad, p, q, f); + + lenient(in).s("s") + .u(u).ud(u).h(he).hd(hd).n(port) + .g(ae).gd(ad) + .p(p).pd(p) + .q(q).qd(q).f(f).fd(f) + .tl(in) + .z(); + + in = String.format("%s://%s%s?%s#%s", s, a, pd, q, f); + + lenient(in).s("s") + .u(u).ud(u).h(h).hd(h).n(port) + .g(a).gd(a) + .p(pe).pd(pd) + .q(q).qd(q).f(f).fd(f) + .tl(in) + .z(); + + in = String.format("%s://%s%s?%s#%s", s, a, p, qd, f); + + lenient(in).s("s") + .u(u).ud(u).h(h).hd(h).n(port) + .g(a).gd(a) + .p(p).pd(p) + .q(qe).qd(qd).f(f).fd(f) + .tl(in) + .z(); + + in = String.format("%s://%s%s?%s#%s", s, a, p, q, fd); + + lenient(in).s("s") + .u(u).ud(u).h(h).hd(h).n(port) + .g(a).gd(a) + .p(p).pd(p) + .q(q).qd(q).f(fe).fd(fd) + .tl(in) + .z(); + + ae = auth(ue, he, port); 
ad = auth(ud, hd, port); + in = String.format("%s://%s%s?%s#%s", s, ad, pd, qd, fd); + + lenient(in).s("s") + .u(ue).ud(ud).h(he).hd(hd).n(port) + .g(ae).gd(ad) + .p(pe).pd(pd) + .q(qe).qd(qd).f(fe).fd(fd) + .tl(in) + .z(); + + s = "mailto"; p="d$$.$$f@o$$.c"; + pe=p.replace("$$", l); pd=p.replace("$$", ld); + in = String.format("%s:%s", s, pd); + + lenient(in).s(s).o(pd).sp(pe).p(pe).pd(pd).tl(in).z(); + + } + + String in = "s://h/p?q=%41|B&%25%34%33"; + lenient(in).s("s").h("h").p("/p") + .q("q=%41%7CB&%25%34%33") + .qd("q=A|B&%43") + .tl("s://h/p?q=%41|B&%25%34%33").z(); + + // no funny characters allowed in scheme, even when parsed + // leniently... + lenient("s|s://h/p").x().z(); + + } + + // ... the printable characters in US-ASCII that are not allowed in + // URIs, namely {@code '<'}, {@code '>'}, {@code '"'}, space, + // {@code '{'}, {@code '}'}, {@code '|'}, {@code '\'}, {@code '^'}, + // and {@code '`'}. + static final String[][] IPUAC = { + { "<", "%3C" }, + { ">", "%3E" }, + { "\"", "%22" }, + { " ", "%20" }, + { "{", "%7B" }, + { "}", "%7D" }, + { "|", "%7C" }, + { "\\", "%5C" }, + { "^", "%5E" }, + { "`", "%60" } }; + + // A number of prefixes and suffixes to test IPUAC's with + static final String[][] PRE_SUF_FIXES = { + { "", "" }, + { "a", "" }, + { "", "b" }, + { "a", "b" }, + { "%", "" }, + { "" , "%" }, + { "%", "%" }, + { "%X", "" }, + { "" , "%Y" }, + { "%Z", "%Z" }, + { "%XX", "" }, + { "" , "%YY" }, + { "%ZZ", "%ZZ" }, + { "%41", "" }, + { "" , "%42" }, + { "%43", "%43" }, + }; + + static void lenientUtilities() { + header("Lenient utilities"); + + eq(null, IRI.quoteLenient(null)); + eq(null, IRI.unquoteLenient(null)); + eq("", IRI.quoteLenient("")); + eq("", IRI.unquoteLenient("")); + + for (String[] values : IPUAC) { + for (String[] preSuf : PRE_SUF_FIXES) { + String raw = preSuf[0] + values[0] + preSuf[1]; + String encoded = preSuf[0] + values[1] + preSuf[1]; + + out.format("raw: %8s, encoded: %s%n", raw, encoded); + + eq(encoded, 
IRI.quoteLenient(raw)); + eq(raw, IRI.unquoteLenient(encoded)); + eq(raw, IRI.unquoteLenient(IRI.quoteLenient(raw))); + } + } + } + + static void specials() { + header("Special chars U+FFF0-U+FFFD"); + for (char c='\ufff0'; c <= '\ufffd'; c++) { + String hex = appendEscape(new StringBuilder(9), c) + .toString(); + header("char U+FFF" + hexDigits[c & 0x000F]); + test("s://h/p?q="+c).x().z(); + test("s" + c, "h", "/p", null, null) + .x().z(); + test("s", "h", "/p", "q=q", "f"+c) + .s("s").h("h").p("/p").q("q=q") + .fd("f" + c) + .f("f" + hex) + .z(); + test("s", "h", "/p", "q=" + c, null) + .s("s").h("h").p("/p") + .qd("q=" + c) + .q("q=" + hex) + .z(); + test("s", "h" + c, "/p", null, null) + .s("s").hd("h" + c) + .h("h" + hex) + .p("/p") + .z(); + test("s", "h", "/p" + c, null, null) + .s("s").h("h") + .p("/p" + hex) + .pd("/p" + c) + .z(); + } + } + + static void tests() throws IOException, URISyntaxException { + iaes(); + rfc2396(); + rfc3986(); + ip(); + misc(); + chars(); + eqHashComp(); + serial(); + urls(); + npes(); + bugs(); + utils(); + specials(); + lenient(); + lenientUtilities(); + } + + + // -- Command-line invocation -- + + static void usage() { + out.println("Usage:"); + out.println(" java Test -- Runs all tests in this file"); + out.println(" java Test -- Parses uri, shows components"); + out.println(" java Test -- Parses uri and base, then resolves"); + out.println(" uri against base"); + } + + static void clargs(String base, String uri) { + IRI b = null, u; + try { + if (base != null) { + b = IRI.parseIRI(base); + out.println(base); + show(b); + } + u = IRI.parseIRI(uri); + out.println(uri); + show(u); + if (base != null) { + IRI r = b.resolve(u); + out.println(r); + show(r); + } + } catch (URISyntaxException x) { + show("ERROR", x); + x.printStackTrace(out); + } + } + + + // miscellaneous bugs/rfes that don't fit in with the test framework + + static void bugs() { + b7023363(); + b6339649(); + b6933879(); + b8037396(); + } + + // 6339649 - 
include detail message from nested exception + private static void b6339649() { + try { + IRI uri = IRI.of("http://nowhere.net/should not be permitted"); + } catch (IllegalArgumentException e) { + if ("".equals(e.getMessage()) || e.getMessage() == null) { + throw new RuntimeException ("No detail message"); + } + } + } + + // 7023363: URI("ftp", "[www.abc.com]", "/dir1/dir2", "query", "frag") + // should throw URISyntaxException + private static void b7023363() { + // [www.abc.com] is not a legal IPv6 litteral. + test("ftp://[www.abc.com]/dir1/dir2?query#frag") + .x().z(); + + // as per RFC 3986 "%5Bwww.abc.com%5D" is a legal hostname. + test("ftp", "[www.abc.com]", "/dir1/dir2", "query", "frag") + .s("ftp").hd("[www.abc.com]").h("%5Bwww.abc.com%5D") + .p("/dir1/dir2").q("query").f("frag").z(); + + // If there is a colon enclosed in the [ ... ] then this will be interpreted + // as an IPv6 literal and an exception will be raised. + // This is arguable. + test("ftp", "[www.a:bc.com]", "/dir1/dir2", "query", "frag") + .x().z(); + } + + // 6933879 - check that "." and "_" characters are allowed in IPv6 scope_id. 
+ private static void b6933879() { + final String HOST = "fe80::c00:16fe:cebe:3214%eth1.12_55"; + IRI uri; + try { + uri = IRI.createHierarchical("http", null, HOST, 10, "/", null, null); + } catch (URISyntaxException ex) { + throw new AssertionError("Should not happen", ex); + } + eq("[" + HOST + "]", uri.getHostString()); + } + + private static void b8037396() { + + // primary checks: + + IRI u; + try { + u = IRI.createHierarchical("http", "example.org", "/[a b]", "[a b]", "[a b]"); + } catch (URISyntaxException e) { + throw new AssertionError("shouldn't ever happen", e); + } + eq("/[a b]", u.getPath()); + eq("[a b]", u.getQuery()); + eq("[a b]", u.getFragment()); + + // additional checks: + // * '%' symbols are still decoded outside square brackets + // * the getRawXXX() functionality left intact + + try { + u = IRI.createHierarchical("http", "example.org", "/a b[c d]", "a b[c d]", "a b[c d]"); + } catch (URISyntaxException e) { + throw new AssertionError("shouldn't ever happen", e); + } + + eq("/a b[c d]", u.getPath()); + eq("a b[c d]", u.getQuery()); + eq("a b[c d]", u.getFragment()); + + eq("/a%20b%5Bc%20d%5D", u.getRawPath()); + + // RFC3986: [ ] are gen-delim characters and should be escaped + // in query & fragment too + eq("a%20b%5Bc%20d%5D", u.getRawQuery()); + eq("a%20b%5Bc%20d%5D", u.getRawFragment()); + } + + public static void main(String[] args) throws Exception { + switch (args.length) { + + case 0: + tests(); + out.println(); + out.println("Test cases: " + testCount); + break; + + case 1: + if (args[0].equals("-help")) { + usage(); + break; + } + clargs(null, args[0]); + break; + + case 2: + clargs(args[0], args[1]); + break; + + default: + usage(); + break; + + } + } + +} diff --git a/net/src/main/java/org/xbib/net/Context.java b/net/src/main/java/org/xbib/net/Context.java deleted file mode 100644 index 1ed84a9..0000000 --- a/net/src/main/java/org/xbib/net/Context.java +++ /dev/null @@ -1,8 +0,0 @@ -package org.xbib.net; - -public interface 
Context { - - Req request(); - - Resp response(); -} diff --git a/net/src/main/java/org/xbib/net/Handler.java b/net/src/main/java/org/xbib/net/Handler.java deleted file mode 100644 index cfef94f..0000000 --- a/net/src/main/java/org/xbib/net/Handler.java +++ /dev/null @@ -1,10 +0,0 @@ -package org.xbib.net; - -import java.io.IOException; - -@SuppressWarnings("rawtypes") -@FunctionalInterface -public interface Handler { - - void handle(C context) throws IOException; -} diff --git a/net/src/main/java/org/xbib/net/HandlerException.java b/net/src/main/java/org/xbib/net/HandlerException.java deleted file mode 100644 index 8483843..0000000 --- a/net/src/main/java/org/xbib/net/HandlerException.java +++ /dev/null @@ -1,21 +0,0 @@ -package org.xbib.net; - -@SuppressWarnings("serial") -public class HandlerException extends RuntimeException { - - public HandlerException() { - super(); - } - - public HandlerException(String message) { - super(message); - } - - public HandlerException(Exception e) { - super(e); - } - - public HandlerException(String message, Exception e) { - super(message, e); - } -} diff --git a/net/src/main/java/org/xbib/net/IRI.java b/net/src/main/java/org/xbib/net/IRI.java index b09aff6..b27b04b 100644 --- a/net/src/main/java/org/xbib/net/IRI.java +++ b/net/src/main/java/org/xbib/net/IRI.java @@ -8,6 +8,7 @@ import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.util.regex.Matcher; import java.util.regex.Pattern; + import org.xbib.net.scheme.Scheme; import org.xbib.net.scheme.SchemeRegistry; import org.xbib.net.util.CharUtils; diff --git a/net/src/main/java/org/xbib/net/URL.java b/net/src/main/java/org/xbib/net/URL.java index a919cf4..869aed8 100755 --- a/net/src/main/java/org/xbib/net/URL.java +++ b/net/src/main/java/org/xbib/net/URL.java @@ -38,10 +38,7 @@ import java.util.Objects; * The reason for the name {@code URL} is merely because of the popularity of the name, which * overweighs the URI or IRI popularity. 
* - * [source,java] - * -- * URL url = URL.http().resolveFromHost("google.com").build(); - * -- * */ public class URL implements Comparable { diff --git a/net/src/test/java/org/xbib/net/IRITest.java b/net/src/test/java/org/xbib/net/IRITest.java deleted file mode 100644 index e42fae8..0000000 --- a/net/src/test/java/org/xbib/net/IRITest.java +++ /dev/null @@ -1,176 +0,0 @@ -package org.xbib.net; - -import org.junit.jupiter.api.Test; - -import java.text.Normalizer; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; - -class IRITest { - - @Test - void testIpv4() { - URL iri = URL.create("http://127.0.0.1"); - assertEquals("http://127.0.0.1", iri.toExternalForm()); - } - - @Test - void testIpv6() { - URL iri = URL.from("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]"); - assertEquals(iri.getProtocolVersion(), ProtocolVersion.IPV6); - assertEquals("http://[2001:db8:85a3:8d3:1319:8a2e:370:7344]", iri.toString()); - } - - @Test - void testIpv6Invalid() { - URL iri = URL.from("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:734o]"); - assertEquals(URL.nullUrl(), iri); - } - - @Test - void testSimple() { - URL iri = URL.create("http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org"); - assertEquals("http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org", iri.toString()); - } - - @Test - void testFile() throws Exception { - URL iri = URL.create("file:///tmp/test/foo"); - assertEquals("", iri.getHost()); - assertEquals("/tmp/test/foo", iri.getPath()); - assertEquals("file:///tmp/test/foo", iri.toExternalForm()); - assertEquals("file:///tmp/test/foo", iri.toString()); - } - - @Test - void testSimple2() throws Exception { - URL iri = URL.create("http://www.example.org/red%09ros\u00E9#red"); - assertEquals("http://www.example.org/red%09ros%C3%A9#red", iri.toExternalForm()); - } - - @Test - void testNotSoSimple() throws Exception { - URL iri = 
URL.create("http://example.com/\uD800\uDF00\uD800\uDF01\uD800\uDF02"); - assertEquals("http://example.com/%F0%90%8C%80%F0%90%8C%81%F0%90%8C%82", iri.toExternalForm()); - } - - @Test - void testIRItoURI() throws Exception { - URL iri = URL.from("http://\u7D0D\u8C46.example.org/%E2%80%AE"); - assertEquals("http://xn--99zt52a.example.org/%E2%80%AE", iri.toExternalForm()); - } - - @Test - void testComparison() { - URL url1 = URL.create("http://www.example.org/"); - URL url2 = URL.create("http://www.example.org/.."); - URL url3 = URL.create("http://www.Example.org:80"); - assertNotEquals(url1, url2); - assertNotEquals(url1, url3); - assertNotEquals(url2, url1); - assertNotEquals(url2, url3); - assertNotEquals(url3, url1); - assertNotEquals(url3, url2); - assertEquals(url1.normalize(), url2.normalize()); - assertEquals(url1.normalize(), url3.normalize()); - assertEquals(url2.normalize(), url1.normalize()); - assertEquals(url2.normalize(), url3.normalize()); - assertEquals(url3.normalize(), url1.normalize()); - assertEquals(url3.normalize(), url2.normalize()); - } - - @Test - void testUCN() { - URL iri1 = URL.create("http://www.example.org/r\u00E9sum\u00E9.html"); - String s = Normalizer.normalize("http://www.example.org/re\u0301sume\u0301.html", Normalizer.Form.NFC); - URL iri2 = URL.create(s); - assertEquals(iri2, iri1); - } - - @Test - void testNormalizePath() { - URL iri1 = URL.create("http://example.org/%7e%2Fuser%2f"); - URL iri2 = URL.create("http://example.org/%7E%2fuser/"); - assertEquals(iri1.normalize(), iri2.normalize()); - } - - @Test - void testIDN() { - URL iri1 = URL.from("http://r\u00E9sum\u00E9.example.org"); - assertEquals("xn--rsum-bpad.example.org", iri1.getHost()); - } - - @Test - void testResolveRelative() { - URL base = URL.create("http://example.org/foo/"); - assertEquals("http://example.org/", base.resolve("/").toString()); - assertEquals("http://example.org/test", base.resolve("/test").toString()); - assertEquals("http://example.org/foo/test", 
base.resolve("test").toString()); - assertEquals("http://example.org/test", base.resolve("../test").toString()); - assertEquals("http://example.org/foo/test", base.resolve("./test").toString()); - assertEquals("http://example.org/foo/", base.resolve("test/test/../../").toString()); - assertEquals("http://example.org/foo/?test", base.resolve("?test").toString()); - assertEquals("http://example.org/foo/#test", base.resolve("#test").toString()); - assertEquals("http://example.org/foo/", base.resolve(".").toString()); - } - - @Test - void testSchemes() { - URL iri = URL.create("http://a:b@c.org:80/d/e?f#g"); - assertEquals("http", iri.getScheme()); - assertEquals("a:b", iri.getUserInfo()); - assertEquals("c.org", iri.getHost()); - assertEquals(Integer.valueOf(80), iri.getPort()); - assertEquals("/d/e", iri.getPath()); - assertEquals("f", iri.getQuery()); - assertEquals("g", iri.getFragment()); - iri = URL.create("https://a:b@c.org:80/d/e?f#g"); - assertEquals("https", iri.getScheme()); - assertEquals("a:b", iri.getUserInfo()); - assertEquals("c.org", iri.getHost()); - assertEquals(Integer.valueOf(80), iri.getPort()); - assertEquals("/d/e", iri.getPath()); - assertEquals("f", iri.getQuery()); - assertEquals("g", iri.getFragment()); - iri = URL.create("ftp://a:b@c.org:80/d/e?f#g"); - assertEquals("ftp", iri.getScheme()); - assertEquals("a:b", iri.getUserInfo()); - assertEquals("c.org", iri.getHost()); - assertEquals(Integer.valueOf(80), iri.getPort()); - assertEquals("/d/e", iri.getPath()); - assertEquals("f", iri.getQuery()); - assertEquals("g", iri.getFragment()); - iri = URL.create("mailto:joe@example.org?subject=foo"); - assertEquals("mailto", iri.getScheme()); - assertEquals(null, iri.getUserInfo()); - assertEquals(null, iri.getHost()); - assertEquals(null, iri.getPort()); - assertEquals("joe@example.org?subject=foo", iri.getSchemeSpecificPart()); - assertEquals(null, iri.getFragment()); - iri = URL.create("tag:example.org,2006:foo"); - assertEquals("tag", 
iri.getScheme()); - assertEquals(null, iri.getUserInfo()); - assertEquals(null, iri.getHost()); - assertEquals(null, iri.getPort()); - assertEquals("example.org,2006:foo", iri.getSchemeSpecificPart()); - assertEquals(null, iri.getQuery()); - assertEquals(null, iri.getFragment()); - iri = URL.create("urn:lsid:ibm.com:example:82437234964354895798234d"); - assertEquals("urn", iri.getScheme()); - assertEquals(null, iri.getUserInfo()); - assertEquals(null, iri.getHost()); - assertEquals(null, iri.getPort()); - assertEquals("lsid:ibm.com:example:82437234964354895798234d", iri.getSchemeSpecificPart()); - assertEquals(null, iri.getQuery()); - assertEquals(null, iri.getFragment()); - iri = URL.create("data:image/gif;base64,R0lGODdhMAAwAPAAAAAAAP"); - assertEquals("data", iri.getScheme()); - assertEquals(null, iri.getUserInfo()); - assertEquals(null, iri.getHost()); - assertEquals(null, iri.getPort()); - assertEquals("image/gif;base64,R0lGODdhMAAwAPAAAAAAAP", iri.getSchemeSpecificPart()); - assertEquals(null, iri.getQuery()); - assertEquals(null, iri.getFragment()); - } -} diff --git a/net/src/test/java/org/xbib/net/URLTest.java b/net/src/test/java/org/xbib/net/URLTest.java index 698f341..609a014 100644 --- a/net/src/test/java/org/xbib/net/URLTest.java +++ b/net/src/test/java/org/xbib/net/URLTest.java @@ -6,10 +6,12 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import org.junit.jupiter.api.Test; +import java.text.Normalizer; import java.util.ArrayList; import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.fail; class URLTest { @@ -116,4 +118,168 @@ class URLTest { boolean skip; } + @Test + void testIpv4() { + URL iri = URL.create("http://127.0.0.1"); + assertEquals("http://127.0.0.1", iri.toExternalForm()); + } + + @Test + void testIpv6() { + URL iri = 
URL.from("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]"); + assertEquals(iri.getProtocolVersion(), ProtocolVersion.IPV6); + assertEquals("http://[2001:db8:85a3:8d3:1319:8a2e:370:7344]", iri.toString()); + } + + @Test + void testIpv6Invalid() { + URL iri = URL.from("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:734o]"); + assertEquals(URL.nullUrl(), iri); + } + + @Test + void testSimple() { + URL iri = URL.create("http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org"); + assertEquals("http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org", iri.toString()); + } + + @Test + void testFile() throws Exception { + URL iri = URL.create("file:///tmp/test/foo"); + assertEquals("", iri.getHost()); + assertEquals("/tmp/test/foo", iri.getPath()); + assertEquals("file:///tmp/test/foo", iri.toExternalForm()); + assertEquals("file:///tmp/test/foo", iri.toString()); + } + + @Test + void testSimple2() throws Exception { + URL iri = URL.create("http://www.example.org/red%09ros\u00E9#red"); + assertEquals("http://www.example.org/red%09ros%C3%A9#red", iri.toExternalForm()); + } + + @Test + void testNotSoSimple() throws Exception { + URL iri = URL.create("http://example.com/\uD800\uDF00\uD800\uDF01\uD800\uDF02"); + assertEquals("http://example.com/%F0%90%8C%80%F0%90%8C%81%F0%90%8C%82", iri.toExternalForm()); + } + + @Test + void testIRItoURI() throws Exception { + URL iri = URL.from("http://\u7D0D\u8C46.example.org/%E2%80%AE"); + assertEquals("http://xn--99zt52a.example.org/%E2%80%AE", iri.toExternalForm()); + } + + @Test + void testComparison() { + URL url1 = URL.create("http://www.example.org/"); + URL url2 = URL.create("http://www.example.org/.."); + URL url3 = URL.create("http://www.Example.org:80"); + assertNotEquals(url1, url2); + assertNotEquals(url1, url3); + assertNotEquals(url2, url1); + assertNotEquals(url2, url3); + assertNotEquals(url3, url1); + assertNotEquals(url3, url2); + assertEquals(url1.normalize(), url2.normalize()); + 
assertEquals(url1.normalize(), url3.normalize()); + assertEquals(url2.normalize(), url1.normalize()); + assertEquals(url2.normalize(), url3.normalize()); + assertEquals(url3.normalize(), url1.normalize()); + assertEquals(url3.normalize(), url2.normalize()); + } + + @Test + void testUCN() { + URL iri1 = URL.create("http://www.example.org/r\u00E9sum\u00E9.html"); + String s = Normalizer.normalize("http://www.example.org/re\u0301sume\u0301.html", Normalizer.Form.NFC); + URL iri2 = URL.create(s); + assertEquals(iri2, iri1); + } + + @Test + void testNormalizePath() { + URL iri1 = URL.create("http://example.org/%7e%2Fuser%2f"); + URL iri2 = URL.create("http://example.org/%7E%2fuser/"); + assertEquals(iri1.normalize(), iri2.normalize()); + } + + @Test + void testIDN() { + URL iri1 = URL.from("http://r\u00E9sum\u00E9.example.org"); + assertEquals("xn--rsum-bpad.example.org", iri1.getHost()); + } + + @Test + void testResolveRelative() { + URL base = URL.create("http://example.org/foo/"); + assertEquals("http://example.org/", base.resolve("/").toString()); + assertEquals("http://example.org/test", base.resolve("/test").toString()); + assertEquals("http://example.org/foo/test", base.resolve("test").toString()); + assertEquals("http://example.org/test", base.resolve("../test").toString()); + assertEquals("http://example.org/foo/test", base.resolve("./test").toString()); + assertEquals("http://example.org/foo/", base.resolve("test/test/../../").toString()); + assertEquals("http://example.org/foo/?test", base.resolve("?test").toString()); + assertEquals("http://example.org/foo/#test", base.resolve("#test").toString()); + assertEquals("http://example.org/foo/", base.resolve(".").toString()); + } + + @Test + void testSchemes() { + URL iri = URL.create("http://a:b@c.org:80/d/e?f#g"); + assertEquals("http", iri.getScheme()); + assertEquals("a:b", iri.getUserInfo()); + assertEquals("c.org", iri.getHost()); + assertEquals(Integer.valueOf(80), iri.getPort()); + assertEquals("/d/e", 
iri.getPath()); + assertEquals("f", iri.getQuery()); + assertEquals("g", iri.getFragment()); + iri = URL.create("https://a:b@c.org:80/d/e?f#g"); + assertEquals("https", iri.getScheme()); + assertEquals("a:b", iri.getUserInfo()); + assertEquals("c.org", iri.getHost()); + assertEquals(Integer.valueOf(80), iri.getPort()); + assertEquals("/d/e", iri.getPath()); + assertEquals("f", iri.getQuery()); + assertEquals("g", iri.getFragment()); + iri = URL.create("ftp://a:b@c.org:80/d/e?f#g"); + assertEquals("ftp", iri.getScheme()); + assertEquals("a:b", iri.getUserInfo()); + assertEquals("c.org", iri.getHost()); + assertEquals(Integer.valueOf(80), iri.getPort()); + assertEquals("/d/e", iri.getPath()); + assertEquals("f", iri.getQuery()); + assertEquals("g", iri.getFragment()); + iri = URL.create("mailto:joe@example.org?subject=foo"); + assertEquals("mailto", iri.getScheme()); + assertEquals(null, iri.getUserInfo()); + assertEquals(null, iri.getHost()); + assertEquals(null, iri.getPort()); + assertEquals("joe@example.org?subject=foo", iri.getSchemeSpecificPart()); + assertEquals(null, iri.getFragment()); + iri = URL.create("tag:example.org,2006:foo"); + assertEquals("tag", iri.getScheme()); + assertEquals(null, iri.getUserInfo()); + assertEquals(null, iri.getHost()); + assertEquals(null, iri.getPort()); + assertEquals("example.org,2006:foo", iri.getSchemeSpecificPart()); + assertEquals(null, iri.getQuery()); + assertEquals(null, iri.getFragment()); + iri = URL.create("urn:lsid:ibm.com:example:82437234964354895798234d"); + assertEquals("urn", iri.getScheme()); + assertEquals(null, iri.getUserInfo()); + assertEquals(null, iri.getHost()); + assertEquals(null, iri.getPort()); + assertEquals("lsid:ibm.com:example:82437234964354895798234d", iri.getSchemeSpecificPart()); + assertEquals(null, iri.getQuery()); + assertEquals(null, iri.getFragment()); + iri = URL.create("data:image/gif;base64,R0lGODdhMAAwAPAAAAAAAP"); + assertEquals("data", iri.getScheme()); + assertEquals(null, 
iri.getUserInfo()); + assertEquals(null, iri.getHost()); + assertEquals(null, iri.getPort()); + assertEquals("image/gif;base64,R0lGODdhMAAwAPAAAAAAAP", iri.getSchemeSpecificPart()); + assertEquals(null, iri.getQuery()); + assertEquals(null, iri.getFragment()); + } } diff --git a/net/src/test/java/org/xbib/net/OtherIRITest.java b/net/src/test/java/org/xbib/net/resource/IRITest.java similarity index 98% rename from net/src/test/java/org/xbib/net/OtherIRITest.java rename to net/src/test/java/org/xbib/net/resource/IRITest.java index a6b7a69..c6c56aa 100644 --- a/net/src/test/java/org/xbib/net/OtherIRITest.java +++ b/net/src/test/java/org/xbib/net/resource/IRITest.java @@ -1,14 +1,16 @@ -package org.xbib.net; +package org.xbib.net.resource; + +import org.junit.jupiter.api.Test; +import org.xbib.net.IRI; import java.net.URI; import java.net.URISyntaxException; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; -@Disabled -public class OtherIRITest { +class IRITest { @Test public void testSimple() throws Exception { @@ -199,4 +201,3 @@ public class OtherIRITest { } } - diff --git a/settings.gradle b/settings.gradle index 7984cf4..9c2cc5d 100644 --- a/settings.gradle +++ b/settings.gradle @@ -23,6 +23,7 @@ include 'net' include 'net-bouncycastle' include 'net-mime' include 'net-path' +include 'net-resource' include 'net-security' include 'net-socket' include 'benchmark'