From 7ef0e83364e2bc467bbed150a67070cc5ac81af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Prante?= Date: Fri, 21 Oct 2022 23:59:17 +0200 Subject: [PATCH] add IRI --- net/NOTICE.txt | 6 + net/src/main/java/org/xbib/net/IRI.java | 803 ++++++++++++++++++ .../java/org/xbib/net/IRISyntaxException.java | 17 + .../net/util/CharArrayCodepointIterator.java | 28 + .../util/CharSequenceCodepointIterator.java | 25 + .../java/org/xbib/net/util/CharUtils.java | 597 +++++++++++++ .../java/org/xbib/net/util/Codepoint.java | 92 ++ .../org/xbib/net/util/CodepointFilter.java | 10 + .../org/xbib/net/util/CodepointIterator.java | 268 ++++++ .../net/util/DelegatingCodepointIterator.java | 92 ++ .../net/util/InvalidCharacterException.java | 17 + .../main/java/org/xbib/net/util/Profile.java | 54 ++ .../net/util/RestrictedCodepointIterator.java | 83 ++ .../test/java/org/xbib/net/OtherIRITest.java | 202 +++++ 14 files changed, 2294 insertions(+) create mode 100644 net/src/main/java/org/xbib/net/IRI.java create mode 100644 net/src/main/java/org/xbib/net/IRISyntaxException.java create mode 100644 net/src/main/java/org/xbib/net/util/CharArrayCodepointIterator.java create mode 100644 net/src/main/java/org/xbib/net/util/CharSequenceCodepointIterator.java create mode 100644 net/src/main/java/org/xbib/net/util/CharUtils.java create mode 100644 net/src/main/java/org/xbib/net/util/Codepoint.java create mode 100644 net/src/main/java/org/xbib/net/util/CodepointFilter.java create mode 100644 net/src/main/java/org/xbib/net/util/CodepointIterator.java create mode 100644 net/src/main/java/org/xbib/net/util/DelegatingCodepointIterator.java create mode 100644 net/src/main/java/org/xbib/net/util/InvalidCharacterException.java create mode 100644 net/src/main/java/org/xbib/net/util/Profile.java create mode 100644 net/src/main/java/org/xbib/net/util/RestrictedCodepointIterator.java create mode 100644 net/src/test/java/org/xbib/net/OtherIRITest.java diff --git a/net/NOTICE.txt b/net/NOTICE.txt index 
e792c3b..fff25ae 100644 --- a/net/NOTICE.txt +++ b/net/NOTICE.txt @@ -19,3 +19,9 @@ The org.xbib.net.buffer "DataBuffer" classes are taken from Spring Framework, Co https://github.com/spring-projects/spring-framework/tree/main/spring-core/src/main/java/org/springframework/core/io/buffer License: Apacche 2.0 + +The IRI class is a modified version taken from org.apache.abdera.i18n.text + +https://abdera.apache.org + +License: Apacche 2.0 diff --git a/net/src/main/java/org/xbib/net/IRI.java b/net/src/main/java/org/xbib/net/IRI.java new file mode 100644 index 0000000..b09aff6 --- /dev/null +++ b/net/src/main/java/org/xbib/net/IRI.java @@ -0,0 +1,803 @@ +package org.xbib.net; + +import java.io.IOException; +import java.net.IDN; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.xbib.net.scheme.Scheme; +import org.xbib.net.scheme.SchemeRegistry; +import org.xbib.net.util.CharUtils; +import org.xbib.net.util.InvalidCharacterException; +import org.xbib.net.util.Profile; + +public class IRI implements Comparable<IRI> { + + private static final Pattern IRIPATTERN = + Pattern.compile("^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\\?([^#]*))?(?:#(.*))?"); + + private final IRIBuilder builder; + + IRI(IRIBuilder builder) { + this.builder = builder; + } + + public static IRIBuilder builder() { + return new IRIBuilder(); + } + + public static IRI create(String iri) { + return IRI.builder().from(iri).build(); + } + + public String getScheme() { + return builder.scheme; + } + + public String getAuthority() { + return (builder.authority != null && builder.authority.length() > 0) ? builder.authority : null; + } + + public String getFragment() { + return builder.fragment; + } + + public String getHost() { + return (builder.host != null && builder.host.length() > 0) ? 
builder.host : null; + } + + public String getPath() { + return builder.path; + } + + public int getPort() { + return builder.port; + } + + public String getQuery() { + return builder.query; + } + + public String getSchemeSpecificPart() { + return builder.schemeSpecificPart; + } + + public String getUserInfo() { + return builder.userinfo; + } + + public boolean isAbsolute() { + return builder.scheme != null; + } + + public boolean isOpaque() { + return builder.path == null; + } + + public boolean isPathAbsolute() { + String s = getPath(); + return s != null && s.length() > 0 && s.charAt(0) == '/'; + } + + public boolean isSameDocumentReference() { + return builder.scheme == null && builder.authority == null + && (builder.path == null || builder.path.length() == 0 || ".".equals(builder.path)) + && builder.query == null; + } + + + public String getASCIIHost() { + return builder.getASCIIHost(); + } + + public String getASCIIAuthority() { + return builder.getASCIIAuthority(); + } + + public String getASCIIFragment() { + return builder.getASCIIFragment(); + } + + public String getASCIIPath() { + return builder.getASCIIPath(); + } + + public String getASCIIQuery() { + return builder.getASCIIQuery(); + } + + public String getASCIIUserInfo() { + return builder.getASCIIUserInfo(); + } + + public String getASCIISchemeSpecificPart() { + return builder.getASCIISchemeSpecificPart(); + } + + public IRI resolve(IRI iri) { + return resolve(this, iri); + } + + public IRI resolve(String iri) { + return resolve(this, IRI.builder().from(iri).build()); + } + + public static IRI resolve(IRI b, IRI c) { + if (c == null) { + return null; + } + if ("".equals(c.toString()) || "#".equals(c.toString()) + || ".".equals(c.toString()) + || "./".equals(c.toString())) { + return b; + } + if (b == null) { + return c; + } + if (c.isOpaque() || b.isOpaque()) { + return c; + } + if (c.isSameDocumentReference()) { + String cfragment = c.getFragment(); + String bfragment = b.getFragment(); + if 
((cfragment == null && bfragment == null) || (cfragment != null && cfragment.equals(bfragment))) { + return b; + } else { + return IRI.builder() + .scheme(b.builder.scheme) + .authority(b.builder.authority) + .userinfo(b.builder.userinfo) + .host(b.builder.host) + .port(b.builder.port) + .path(normalizePath(b.builder.path)) + .query(b.builder.query) + .fragment(cfragment) + .build(); + } + } + if (c.isAbsolute()) { + return c; + } + String scheme = b.builder.scheme; + String query = c.getQuery(); + String fragment = c.getFragment(); + String userinfo; + String authority; + String host; + int port; + String path; + if (c.getAuthority() == null) { + authority = b.getAuthority(); + userinfo = b.getUserInfo(); + host = b.getHost(); + port = b.getPort(); + path = c.isPathAbsolute() ? normalizePath(c.getPath()) : resolve(b.getPath(), c.getPath()); + } else { + authority = c.getAuthority(); + userinfo = c.getUserInfo(); + host = c.getHost(); + port = c.getPort(); + path = normalizePath(c.getPath()); + } + return IRI.builder() + .scheme(scheme) + .authority(authority) + .userinfo(userinfo) + .host(host) + .port(port) + .path(path) + .query(query) + .fragment(fragment) + .build(); + } + + public static IRI relativize(IRI b, IRI c) { + if (c.isOpaque() || b.isOpaque()) { + return c; + } + if ((b.builder.scheme == null && c.builder.scheme != null) || (b.builder.scheme != null && c.builder.scheme == null) + || (b.builder.scheme != null && !b.builder.scheme.equalsIgnoreCase(c.builder.scheme))) { + return c; + } + String bpath = normalizePath(b.getPath()); + String cpath = normalizePath(c.getPath()); + if (!bpath.equals(cpath)) { + if (bpath.charAt(bpath.length() - 1) != '/') { + bpath += "/"; + } + if (!cpath.startsWith(bpath)) { + return c; + } + } + return IRI.builder() + .scheme(null) + .authority(null) + .userinfo(null) + .host(null) + .port(-1) + .path(normalizePath(cpath.substring(bpath.length()))) + .query(c.getQuery()) + .fragment(c.getFragment()) + .build(); + } + + 
private static String normalizePath(String path) { + if (path == null || path.length() == 0) { + return "/"; + } + String[] segments = path.split("/"); + if (segments.length < 2) { + return path; + } + StringBuilder buf = new StringBuilder("/"); + for (int n = 0; n < segments.length; n++) { + String segment = segments[n].intern(); + if (".".equals(segment)) { + segments[n] = null; + } + } + PercentDecoder percentDecoder = new PercentDecoder(); + for (String segment : segments) { + if (segment != null) { + if (buf.length() > 1) { + buf.append('/'); + } + try { + buf.append(PercentEncoders.getMatrixEncoder(StandardCharsets.UTF_8).encode(percentDecoder.decode(segment))); + } catch (IOException e) { + //logger.log(Level.FINE, e.getMessage(), e); + } + } + } + if (path.endsWith("/") || path.endsWith("/.")) { + buf.append('/'); + } + return buf.toString(); + } + + private static String resolve(String bpath, String cpath) { + if (bpath == null && cpath == null) { + return null; + } + if (bpath == null) { + return (!cpath.startsWith("/")) ? 
"/" + cpath : cpath; + } + if (cpath == null) { + return bpath; + } + StringBuilder buf = new StringBuilder(""); + int n = bpath.lastIndexOf('/'); + if (n > -1) { + buf.append(bpath, 0, n + 1); + } + if (cpath.length() != 0) { + buf.append(cpath); + } + if (buf.charAt(0) != '/') { + buf.insert(0, '/'); + } + return normalizePath(buf.toString()); + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(); + String s = getScheme(); + if (s != null && !s.isEmpty()) { + buf.append(s).append(':'); + } + buf.append(getSchemeSpecificPart()); + return buf.toString(); + } + + public String toEncodedString() throws IOException { + return PercentEncoders.getUnreservedEncoder(StandardCharsets.UTF_8).encode(toString()); + } + + public String toASCIIString() { + StringBuilder buf = new StringBuilder(); + String s = getScheme(); + if (s != null && !s.isEmpty()) { + buf.append(s).append(':'); + } + buf.append(getASCIISchemeSpecificPart()); + return buf.toString(); + } + + public String toBIDIString() { + return CharUtils.wrapBidi(toString(), CharUtils.LRE); + } + + public URI toURI() throws URISyntaxException { + return new URI(toASCIIString()); + } + + public java.net.URL toURL() throws MalformedURLException, URISyntaxException { + return toURI().toURL(); + } + + @Override + public int hashCode() { + final int p = 31; + int result = 1; + result = p * result + ((builder.authority == null) ? 0 : builder.authority.hashCode()); + result = p * result + ((builder.fragment == null) ? 0 : builder.fragment.hashCode()); + result = p * result + ((builder.host == null) ? 0 : builder.host.hashCode()); + result = p * result + ((builder.path == null) ? 0 : builder.path.hashCode()); + result = p * result + builder.port; + result = p * result + ((builder.query == null) ? 0 : builder.query.hashCode()); + result = p * result + ((builder.scheme == null) ? 0 : builder.scheme.hashCode()); + result = p * result + ((builder.userinfo == null) ? 
0 : builder.userinfo.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final IRI other = (IRI) obj; + if (builder.authority == null) { + if (other.builder.authority != null) { + return false; + } + } else if (!builder.authority.equals(other.builder.authority)) { + return false; + } + if (builder.fragment == null) { + if (other.builder.fragment != null) { + return false; + } + } else if (!builder.fragment.equals(other.builder.fragment)) { + return false; + } + if (builder.host == null) { + if (other.builder.host != null) { + return false; + } + } else if (!builder.host.equals(other.builder.host)) { + return false; + } + if (builder.path == null) { + if (other.builder.path != null) { + return false; + } + } else if (!builder.path.equals(other.builder.path)) { + return false; + } + if (builder.port != other.builder.port) { + return false; + } + if (builder.query == null) { + if (other.builder.query != null) { + return false; + } + } else if (!builder.query.equals(other.builder.query)) { + return false; + } + if (builder.scheme == null) { + if (other.builder.scheme != null) { + return false; + } + } else if (!builder.scheme.equals(other.builder.scheme)) { + return false; + } + if (builder.userinfo == null) { + return other.builder.userinfo == null; + } else { + return builder.userinfo.equals(other.builder.userinfo); + } + } + + @Override + public int compareTo(IRI that) { + int c; + if ((c = compareIgnoringCase(builder.scheme, that.builder.scheme)) != 0) { + return c; + } + if (isOpaque()) { + if (that.isOpaque()) { + // Both opaque + if ((c = compare(builder.schemeSpecificPart, that.builder.schemeSpecificPart)) != 0) { + return c; + } + return compare(builder.fragment, that.builder.fragment); + } + return +1; + } else if (that.isOpaque()) { + return -1; + } + // Hierarchical + if ((builder.host 
!= null) && (that.builder.host != null)) { + // Both server-based + if ((c = compare(builder.userinfo, that.builder.userinfo)) != 0) { + return c; + } + if ((c = compareIgnoringCase(builder.host, that.builder.host)) != 0) { + return c; + } + if ((c = Integer.compare(builder.port, that.builder.port)) != 0) { + return c; + } + } else { + if ((c = compare(builder.authority, that.builder.authority)) != 0) { + return c; + } + } + if ((c = compare(builder.path, that.builder.path)) != 0) { + return c; + } + if ((c = compare(builder.query, that.builder.query)) != 0) { + return c; + } + return compare(builder.fragment, that.builder.fragment); + } + + private int compare(String s, String t) { + if (s != null) { + if (s.equals(t)) { + return 0; + } + if (t != null) { + return s.compareTo(t); + } else { + return +1; + } + } else { + return (t == null) ? 0 : -1; // two absent components are equal; absent sorts before present + } + } + + private int compareIgnoringCase(String s, String t) { + if (s != null) { + if (s.equals(t)) { + return 0; + } + if (t != null) { + int sn = s.length(); + int tn = t.length(); + int n = Math.min(sn, tn); + for (int i = 0; i < n; i++) { + int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); + if (c != 0) { + return c; + } + } + return sn - tn; + } + return +1; + } else { + return (t == null) ? 0 : -1; // two absent components are equal; absent sorts before present + } + } + + private int toLower(char c) { + if ((c >= 'A') && (c <= 'Z')) { + return c + ('a' - 'A'); + } + return c; + } + + /** + * Collects the components of an IRI; instances are obtained from IRI.builder(). + */ + public static class IRIBuilder { + + final SchemeRegistry reg = SchemeRegistry.getInstance(); + + Scheme schemeClass; + + String scheme; + + String schemeSpecificPart; + + String authority; + + String userinfo; + + String host; + + int port = -1; + + String path; + + String query; + + String fragment; + + private String asciiHost; + + private String asciiAuthority; + + private String asciiUserinfo; + + private String asciiSchemeSpecificPart; + + private String asciiPath; + + private String asciiQuery; + + private String asciiFragment; + + private IRIBuilder() { + } + + public IRIBuilder from(String string) { + 
parse(CharUtils.stripBidi(string)); + authorityAndSchemeSpecificPart(); + return this; + } + + public IRIBuilder from(URI uri) { + scheme = uri.getScheme(); + schemeClass = reg.getScheme(scheme); + authority = uri.getAuthority(); + path = uri.getPath(); + query = uri.getQuery(); + fragment = uri.getFragment(); + parseAuthority(); + authorityAndSchemeSpecificPart(); + return this; + } + + public IRIBuilder from(IRI uri) { + scheme = uri.getScheme(); + schemeClass = reg.getScheme(scheme); + authority = uri.getAuthority(); + path = uri.getPath(); + query = uri.getQuery(); + fragment = uri.getFragment(); + parseAuthority(); + authorityAndSchemeSpecificPart(); + return this; + } + + public IRIBuilder from(String scheme, String schemeSpecificPart, String fragment) { + this.scheme = scheme.toLowerCase(); + this.schemeSpecificPart = schemeSpecificPart; + this.fragment = fragment; + authorityAndSchemeSpecificPart(); + return this; + } + + public IRIBuilder scheme(String scheme) { + this.scheme = scheme; + this.schemeClass = reg.getScheme(scheme); + return this; + } + + public IRIBuilder schemeSpecificPart(String schemeSpecificPart) { + this.schemeSpecificPart = schemeSpecificPart; + return this; + } + + public IRIBuilder curie(String prefix, String path) { + this.scheme = prefix; + this.path = path; + return this; + } + + public IRIBuilder curie(String schemeAndPath) { + int pos = schemeAndPath.indexOf(':'); + this.scheme = pos > 0 ? schemeAndPath.substring(0, pos) : null; + this.path = pos > 0 ? 
schemeAndPath.substring(pos + 1) : schemeAndPath; + return this; + } + + public IRIBuilder authority(String authority) { + this.authority = authority; + return this; + } + + public IRIBuilder userinfo(String userinfo) { + this.userinfo = userinfo; + return this; + } + + public IRIBuilder host(String host) { + this.host = host; + return this; + } + + public IRIBuilder port(int port) { + this.port = port; + return this; + } + + public IRIBuilder path(String path) { + this.path = path; + return this; + } + + public IRIBuilder query(String query) { + this.query = query; + return this; + } + + public IRIBuilder fragment(String fragment) { + this.fragment = fragment; + return this; + } + + public IRI build() { + return new IRI(this); + } + + private void parse(String iri) { + try { + Matcher irim = IRIPATTERN.matcher(iri); + if (irim.find()) { + scheme = irim.group(1); + schemeClass = reg.getScheme(scheme); + authority = irim.group(2); + path = irim.group(3); + query = irim.group(4); + fragment = irim.group(5); + parseAuthority(); + try { + CharUtils.verify(scheme, Profile.SCHEME); + CharUtils.verify(path, Profile.IPATH); + CharUtils.verify(query, Profile.IQUERY); + CharUtils.verify(fragment, Profile.IFRAGMENT); + } catch (InvalidCharacterException e) { + throw new IRISyntaxException(e); + } + } else { + throw new IRISyntaxException("invalid Syntax"); + } + } catch (IRISyntaxException e) { + throw e; + } catch (Exception e) { + throw new IRISyntaxException(e); + } + } + + private void parseAuthority() { + if (authority != null) { + // [ '@' ] [ ':' ] + int pos = authority.lastIndexOf('@'); + userinfo = pos >= 0 ? authority.substring(0, pos) : null; + String s = pos >= 0 ? authority.substring(pos + 1) : authority; + pos = s.indexOf(':'); + host = pos >= 0 ? s.substring(0, pos) : s; + port = pos >= 0 ? 
Integer.parseInt(s.substring(pos + 1)) : -1; + try { + CharUtils.verify(userinfo, Profile.IUSERINFO); + CharUtils.verify(host, Profile.IHOST); + } catch (InvalidCharacterException e) { + throw new IRISyntaxException(e); + } + } + } + + private void authorityAndSchemeSpecificPart() { + if (authority == null && (userinfo != null || host != null)) { + StringBuilder buf = new StringBuilder(); + buildAuthority(buf, userinfo, host, port); + authority = (buf.length() != 0) ? buf.toString() : null; + } + StringBuilder buf = new StringBuilder(); + buildSchemeSpecificPart(buf, authority, path, query, fragment); + schemeSpecificPart = buf.toString(); + } + + private static void buildSchemeSpecificPart(StringBuilder buf, String authority, String path, String query, + String fragment) { + if (authority != null) { + buf.append("//"); + buf.append(authority); + } + if (path != null && path.length() > 0) { + buf.append(path); + } + if (query != null) { + buf.append('?'); + buf.append(query); + } + if (fragment != null) { + buf.append('#'); + buf.append(fragment); + } + } + + public String getASCIIHost() { + if (host != null && asciiHost == null) { + if (host.startsWith("[")) { + asciiHost = host; + } else { + asciiHost = IDN.toASCII(host); + } + } + return (asciiHost != null && asciiHost.length() > 0) ? asciiHost : null; + } + + private String getASCIIAuthority() { + if (authority != null && asciiAuthority == null) { + asciiAuthority = buildASCIIAuthority(); + } + return asciiAuthority != null && asciiAuthority.length() > 0 ? 
asciiAuthority : null; + } + + private String buildASCIIAuthority() { + StringBuilder buf = new StringBuilder(); + buildAuthority(buf, getASCIIUserInfo(), getASCIIHost(), port); + return buf.toString(); + } + + private static void buildAuthority(StringBuilder buf, String aui, String ah, int port) { + if (aui != null && aui.length() != 0) { + buf.append(aui); + buf.append('@'); + } + if (ah != null && ah.length() != 0) { + buf.append(ah); + } + if (port != -1) { + buf.append(':'); + buf.append(port); + } + } + + private String getASCIIFragment() { + if (fragment != null && asciiFragment == null) { + try { + asciiFragment = PercentEncoders.getFragmentEncoder(StandardCharsets.UTF_8).encode(fragment); + } catch (IOException e) { + //logger.log(Level.FINE, e.getMessage(), e); + } + } + return asciiFragment; + } + + private String getASCIIPath() { + if (path != null && asciiPath == null) { + try { + asciiPath = PercentEncoders.getPathEncoder(StandardCharsets.UTF_8).encode(path); + } catch (IOException e) { + //logger.log(Level.FINE, e.getMessage(), e); + } + } + return asciiPath; + } + + public String getASCIIQuery() { + if (query != null && asciiQuery == null) { + try { + asciiQuery = PercentEncoders.getQueryEncoder(StandardCharsets.UTF_8).encode(query); + } catch (IOException e) { + //logger.log(Level.FINE, e.getMessage(), e); + } + } + return asciiQuery; + } + + public String getASCIIUserInfo() { + if (userinfo != null && asciiUserinfo == null) { + try { + asciiUserinfo = PercentEncoders.getUnreservedEncoder(StandardCharsets.UTF_8).encode(userinfo); + } catch (IOException e) { + //logger.log(Level.FINE, e.getMessage(), e); + } + } + return asciiUserinfo; + } + + public String getASCIISchemeSpecificPart() { + if (asciiSchemeSpecificPart == null) { + StringBuilder buf = new StringBuilder(); + buildSchemeSpecificPart(buf, getASCIIAuthority(), getASCIIPath(), getASCIIQuery(), getASCIIFragment()); + asciiSchemeSpecificPart = buf.toString(); + } + return 
asciiSchemeSpecificPart; + } + } +} diff --git a/net/src/main/java/org/xbib/net/IRISyntaxException.java b/net/src/main/java/org/xbib/net/IRISyntaxException.java new file mode 100644 index 0000000..bf90e63 --- /dev/null +++ b/net/src/main/java/org/xbib/net/IRISyntaxException.java @@ -0,0 +1,17 @@ +package org.xbib.net; + +/** + * + */ +@SuppressWarnings("serial") +public class IRISyntaxException extends RuntimeException { + + IRISyntaxException(String message) { + super(message); + } + + IRISyntaxException(Throwable cause) { + super(cause); + } + +} diff --git a/net/src/main/java/org/xbib/net/util/CharArrayCodepointIterator.java b/net/src/main/java/org/xbib/net/util/CharArrayCodepointIterator.java new file mode 100644 index 0000000..f8f4d65 --- /dev/null +++ b/net/src/main/java/org/xbib/net/util/CharArrayCodepointIterator.java @@ -0,0 +1,28 @@ +package org.xbib.net.util; + +class CharArrayCodepointIterator extends CodepointIterator { + protected char[] buffer; + + CharArrayCodepointIterator(char[] buffer) { + this(buffer, 0, buffer.length); + } + + CharArrayCodepointIterator(char[] buffer, int n, int e) { + this.buffer = buffer; + this.position = n; + this.limit = Math.min(buffer.length - n, e); + } + + @Override + protected char get() { + return (position < limit) ? 
buffer[position++] : (char) -1; + } + + @Override + protected char get(int index) { + if (index < 0 || index >= limit) { + throw new ArrayIndexOutOfBoundsException(index); + } + return buffer[index]; + } +} diff --git a/net/src/main/java/org/xbib/net/util/CharSequenceCodepointIterator.java b/net/src/main/java/org/xbib/net/util/CharSequenceCodepointIterator.java new file mode 100644 index 0000000..8efeb21 --- /dev/null +++ b/net/src/main/java/org/xbib/net/util/CharSequenceCodepointIterator.java @@ -0,0 +1,25 @@ +package org.xbib.net.util; + +class CharSequenceCodepointIterator extends CodepointIterator { + private final CharSequence buffer; + + CharSequenceCodepointIterator(CharSequence buffer) { + this(buffer, 0, buffer.length()); + } + + CharSequenceCodepointIterator(CharSequence buffer, int n, int e) { + this.buffer = buffer; + this.position = n; + this.limit = Math.min(buffer.length() - n, e); + } + + @Override + protected char get() { + return buffer.charAt(position++); + } + + @Override + protected char get(int index) { + return buffer.charAt(index); + } +} diff --git a/net/src/main/java/org/xbib/net/util/CharUtils.java b/net/src/main/java/org/xbib/net/util/CharUtils.java new file mode 100644 index 0000000..7e30eb1 --- /dev/null +++ b/net/src/main/java/org/xbib/net/util/CharUtils.java @@ -0,0 +1,597 @@ +package org.xbib.net.util; + +/** + * General utilities for dealing with Unicode characters. + */ +public final class CharUtils { + + public static final char LRE = 0x202A; + public static final char RLE = 0x202B; + public static final char LRO = 0x202D; + public static final char RLO = 0x202E; + public static final char LRM = 0x200E; + public static final char RLM = 0x200F; + public static final char PDF = 0x202C; + + private CharUtils() { + } + + /** + * True if the character is a valid unicode codepoint. 
+ * @param c char + * @return true if the character is a valid unicode codepoint + */ + public static boolean isValid(int c) { + return c >= 0x000000 && c <= 0x10ffff; + } + + /** + * True if the character is a valid unicode codepoint. + * @param c code point + * @return true if the character is a valid unicode codepoint + */ + public static boolean isValid(Codepoint c) { + return isValid(c.getValue()); + } + + /** + * True if all the characters in chars are within the set [low,high]. + * @param chars chars + * @param low low + * @param high high + * @return true if all the characters in chars are within the set [low,high] + */ + public static boolean inRange(char[] chars, char low, char high) { + for (char aChar : chars) { + if (aChar < low || aChar > high) { + return false; + } + } + return true; + } + + /** + * True if all the code points in chars are within the set [low,high]; a surrogate pair counts as one code point. + * @param chars UTF-16 chars + * @param low lowest accepted code point + * @param high highest accepted code point + * @return true if every code point in chars is within the set [low,high] + */ + public static boolean inRange(char[] chars, int low, int high) { + for (int i = 0; i < chars.length; i++) { + char n = chars[i]; + Codepoint cp = + (isHighSurrogate(n) && i + 1 < chars.length && isLowSurrogate(chars[i + 1])) + ? toSupplementary(n, chars[++i]) : new Codepoint(n); // ++i steps onto the low surrogate so the pair is consumed together + int c = cp.getValue(); + if (c < low || c > high) { + return false; + } + } + return true; + } + + /** + * True if the codepoint is within the set [low,high]. + * @param codepoint code point + * @param low low + * @param high high + * @return true if the codepoint is within the set [low,high] + */ + public static boolean inRange(int codepoint, int low, int high) { + return codepoint >= low && codepoint <= high; + } + + /** + * Get the high surrogate for a particular unicode codepoint. + * @param c char + * @return high surrogate + */ + public static char getHighSurrogate(int c) { + return c >= 0x10000 ? 
(char) ((0xD800 - (0x10000 >> 10)) + (c >> 10)) : 0; + } + + /** + * Get the low surrogate for a particular unicode codepoint. + * @param c char + * @return low surrogate + */ + public static char getLowSurrogate(int c) { + return c >= 0x10000 ? (char) (0xDC00 + (c & 0x3FF)) : (char) c; + } + + /** + * True if the specified char is a high surrogate. + * @param c char + * @return true if the specified char is a high surrogate + */ + public static boolean isHighSurrogate(char c) { + return c <= '\uDBFF' && c >= '\uD800'; + } + + /** + * True if the specified char is a low surrogate. + * @param c char + * @return true if the specified char is a low surrogate + */ + public static boolean isLowSurrogate(char c) { + return c <= '\uDFFF' && c >= '\uDC00'; + } + + /** + * True if the specified character is supplemental. + * @param c char + * @return true if the specified character is supplemental + */ + public static boolean isSupplementary(int c) { + return c <= 0x10ffff && c >= 0x010000; + } + + /** + * True if the two chars represent a surrogate pair. + * @param high high char + * @param low low char + * @return true if the two chars represent a surrogate pair + */ + public static boolean isSurrogatePair(char high, char low) { + return isHighSurrogate(high) && isLowSurrogate(low); + } + + /** + * Converts the high and low surrogate into a supplementary codepoint. + * @param high high char + * @param low low char + * @return code point + */ + public static Codepoint toSupplementary(char high, char low) { + if (!isHighSurrogate(high)) { + throw new IllegalArgumentException("Invalid High Surrogate"); + } + if (!isLowSurrogate(low)) { + throw new IllegalArgumentException("Invalid Low Surrogate"); + } + return new Codepoint(((high - '\uD800') << 10) + (low - '\uDC00') + 0x010000); + } + + /** + * Return the codepoint at the given location, automatically dealing with surrogate pairs. 
+ * @param s string + * @param i char index into s + * @return the code point at i; a surrogate without a matching partner is wrapped as-is + */ + public static Codepoint codepointAt(String s, int i) { + char c = s.charAt(i); + if (c < 0xD800 || c > 0xDFFF) { + return new Codepoint(c); + } + if (isHighSurrogate(c) && i + 1 < s.length()) { // a high surrogate in the last position has no partner to read + char low = s.charAt(i + 1); + if (isLowSurrogate(low)) { + return toSupplementary(c, low); + } + } else if (isLowSurrogate(c) && i >= 1) { + char high = s.charAt(i - 1); + if (isHighSurrogate(high)) { + return toSupplementary(high, c); + } + } + return new Codepoint(c); + } + + /** + * Return the number of characters used to represent the codepoint (will return 1 or 2). + * @param c code point + * @return the number of characters used to represent the codepoint + */ + public static int length(Codepoint c) { + return c.getCharCount(); + } + + /** + * Return the number of characters used to represent the codepoint (will return 1 or 2). + * @param c code point + * @return the number of characters used to represent the codepoint + */ + public static int length(int c) { + return new Codepoint(c).getCharCount(); + } + + /** + * Return the total number of codepoints in the buffer. Each surrogate pair counts as a single codepoint. + * @param c char sequence + * @return the total number of codepoints in the buffer + */ + public static int length(CharSequence c) { + return length(CodepointIterator.forCharSequence(c)); + } + + /** + * Return the total number of codepoints in the buffer. Each surrogate pair counts as a single codepoint. 
+ * @param c chars + * @return the total number of codepoints in the buffer + */ + public static int length(char[] c) { + return length(CodepointIterator.forCharArray(c)); + } + + private static int length(CodepointIterator ci) { + int n = 0; + while (ci.hasNext()) { + ci.next(); + n++; + } + return n; + } + + private static String supplementaryToString(int c) { + return String.valueOf(getHighSurrogate(c)) + getLowSurrogate(c); + } + + /** + * Return the String representation of the codepoint, automatically dealing with surrogate pairs. + * @param c char + * @return string representation of the codepoint + */ + public static String toString(int c) { + return isSupplementary(c) ? supplementaryToString(c) : String.valueOf((char) c); + } + + /** + * Removes leading and trailing bidi controls from the string. + * @param string string + * @return string without bidi controls + */ + public static String stripBidi(String string) { + String s = string; + if (s == null || s.length() <= 1) { + return s; + } + if (isBidi(s.charAt(0))) { + s = s.substring(1); + } + if (isBidi(s.charAt(s.length() - 1))) { + s = s.substring(0, s.length() - 1); + } + return s; + } + + private static String wrap(String s, char c1, char c2) { + StringBuilder buf = new StringBuilder(s); + if (buf.length() > 1) { + if (buf.charAt(0) != c1) { + buf.insert(0, c1); + } + if (buf.charAt(buf.length() - 1) != c2) { + buf.append(c2); + } + } + return buf.toString(); + } + + /** + * Wrap the string with the specified bidi control. + * @param s string + * @param c char + * @return string with specified bidi control + */ + public static String wrapBidi(String s, char c) { + switch (c) { + case RLE: + return wrap(s, RLE, PDF); + case RLO: + return wrap(s, RLO, PDF); + case LRE: + return wrap(s, LRE, PDF); + case LRO: + return wrap(s, LRO, PDF); + case RLM: + return wrap(s, RLM, RLM); + case LRM: + return wrap(s, LRM, LRM); + default: + return s; + } + } + + /** + * True if the codepoint is a digit. 
+     * @param codepoint code point
+     * @return true if the codepoint is a digit
+     */
+    public static boolean isDigit(int codepoint) {
+        return inRange(codepoint, '0', '9');
+    }
+
+    /**
+     * True if the codepoint is part of the ASCII alphabet (a-z, A-Z).
+     * @param codepoint code point
+     * @return true if the codepoint is an ASCII letter
+     */
+    public static boolean isAlpha(int codepoint) {
+        return inRange(codepoint, 'a', 'z') || inRange(codepoint, 'A', 'Z');
+    }
+
+    /**
+     * True if either isAlpha or isDigit returns true.
+     * @param codepoint code point
+     * @return true if the codepoint is an ASCII letter or digit
+     */
+    public static boolean isAlphaDigit(int codepoint) {
+        return isAlpha(codepoint) || isDigit(codepoint);
+    }
+
+    /**
+     * True if the codepoint is a hexadecimal digit (0-9, a-f, A-F).
+     * @param codepoint code point
+     * @return true if the codepoint is a hex digit
+     */
+    public static boolean isHex(int codepoint) {
+        return inRange(codepoint, 'A', 'F') || inRange(codepoint, 'a', 'f') || isDigit(codepoint);
+    }
+
+    /**
+     * True if the codepoint is a bidi control character.
+     * @param codepoint code point
+     * @return true if the codepoint is a bidi control character
+     */
+    public static boolean isBidi(int codepoint) {
+        if (codepoint == PDF) {
+            return true;
+        }
+        if (codepoint == LRM || codepoint == RLM) {
+            return true;
+        }
+        if (codepoint == LRE || codepoint == RLE) {
+            return true;
+        }
+        return codepoint == LRO || codepoint == RLO;
+    }
+
+    /**
+     * True if the codepoint may appear in a percent-encoded triplet: the percent sign or a hex digit.
+     * @param codepoint code point
+     * @return true if the codepoint is '%' or a hex digit
+     */
+    public static boolean isPctEnc(int codepoint) {
+        if (codepoint == '%') {
+            return true;
+        }
+        return isDigit(codepoint) || inRange(codepoint, 'A', 'F') || inRange(codepoint, 'a', 'f');
+    }
+
+    /**
+     * True if the codepoint is one of the mark characters - _ . ! ~ * ' ( ) or a backslash.
+     * @param codepoint code point
+     * @return true if the codepoint is a mark character
+     */
+    public static boolean isMark(int codepoint) {
+        return "-_.!~*\\'()".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is an ASCII letter, a digit, or one of - . _ ~
+     * @param codepoint code point
+     * @return true if the codepoint is unreserved
+     */
+    public static boolean isUnreserved(int codepoint) {
+        return isAlphaDigit(codepoint) || "-._~".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is one of $ & + , / : ; = ? @ [ ]
+     * @param codepoint code point
+     * @return true if the codepoint is reserved
+     */
+    public static boolean isReserved(int codepoint) {
+        return "$&+,/:;=?@[]".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is one of # / : ? @ [ ]
+     * @param codepoint code point
+     * @return true if the codepoint is a general delimiter
+     */
+    public static boolean isGenDelim(int codepoint) {
+        return "#/:?@[]".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is one of ! $ & ' ( ) * + , ; = or a backslash.
+     * @param codepoint code point
+     * @return true if the codepoint is a sub delimiter
+     */
+    public static boolean isSubDelim(int codepoint) {
+        return "!$&'()*+,;=\\".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is unreserved or one of : @ & = + $ ,
+     * @param codepoint code point
+     * @return true if the codepoint is a path character
+     */
+    public static boolean isPchar(int codepoint) {
+        return isUnreserved(codepoint) || ":@&=+$,".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is a path character (see isPchar) or one of ; / % ,
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in a path
+     */
+    public static boolean isPath(int codepoint) {
+        return isPchar(codepoint) || ";/%,".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is allowed in a path (see isPath) and is not a general delimiter.
+     * @param codepoint code point
+     * @return true if the codepoint is a non-delimiter path codepoint
+     */
+    public static boolean isPathNoDelims(int codepoint) {
+        if (!isPath(codepoint)) {
+            return false;
+        }
+        return !isGenDelim(codepoint);
+    }
+
+    /**
+     * True if the codepoint is an ASCII letter, a digit, or one of + - .
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in a scheme
+     */
+    public static boolean isScheme(int codepoint) {
+        return isAlphaDigit(codepoint) || "+-.".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is unreserved, a sub delimiter, or part of a percent-encoded triplet.
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in user info
+     */
+    public static boolean isUserInfo(int codepoint) {
+        return isUnreserved(codepoint) || isSubDelim(codepoint) || isPctEnc(codepoint);
+    }
+
+    /**
+     * True if the codepoint is a path character (see isPchar) or one of ; / ? %
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in a query
+     */
+    public static boolean isQuery(int codepoint) {
+        return isPchar(codepoint) || ";/?%".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is a path character (see isPchar) or one of / ? %
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in a fragment
+     */
+    public static boolean isFragment(int codepoint) {
+        return isPchar(codepoint) || "/?%".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint lies in one of the "ucschar" ranges: three ranges in the basic plane,
+     * the first 0xFFFE codepoints of each of the planes 1 to 13, and 0xE1000 to 0xEFFFD.
+     * @param codepoint code point
+     * @return true if the codepoint is a ucschar
+     */
+    public static boolean isUcsChar(int codepoint) {
+        if (inRange(codepoint, '\u00A0', '\uD7FF')
+                || inRange(codepoint, '\uF900', '\uFDCF')
+                || inRange(codepoint, '\uFDF0', '\uFFEF')) {
+            return true;
+        }
+        // planes 1 to 13 all use the same offsets 0x0000 to 0xFFFD
+        for (int start = 0x10000; start <= 0xD0000; start += 0x10000) {
+            if (inRange(codepoint, start, start + 0xFFFD)) {
+                return true;
+            }
+        }
+        return inRange(codepoint, 0xE1000, 0xEFFFD);
+    }
+
+    /**
+     * True if the codepoint lies in one of the private-use ranges checked here.
+     * @param codepoint code point
+     * @return true if the codepoint is a private-use codepoint
+     */
+    public static boolean isIprivate(int codepoint) {
+        if (inRange(codepoint, '\uE000', '\uF8FF')) {
+            return true;
+        }
+        return inRange(codepoint, 0xF0000, 0xFFFFD) || inRange(codepoint, 0x100000, 0x10FFFD);
+    }
+
+    /**
+     * True if the codepoint is an ASCII letter or digit, a mark character, or a ucschar.
+     * @param codepoint code point
+     * @return true if the codepoint is unreserved in an IRI
+     */
+    public static boolean isIunreserved(int codepoint) {
+        return isAlphaDigit(codepoint) || isMark(codepoint) || isUcsChar(codepoint);
+    }
+
+    /**
+     * True if the codepoint is IRI-unreserved, a sub delimiter, or one of : @ & = + $
+     * @param codepoint code point
+     * @return true if the codepoint is an IRI path character
+     */
+    public static boolean isIpchar(int codepoint) {
+        return isIunreserved(codepoint) || isSubDelim(codepoint) || ":@&=+$".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is an IRI path character (see isIpchar) or one of ; / % ,
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in an IRI path
+     */
+    public static boolean isIpath(int codepoint) {
+        return isIpchar(codepoint) || ";/%,".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is allowed in an IRI path (see isIpath) and is not a general delimiter.
+     * @param codepoint code point
+     * @return true if the codepoint is a non-delimiter IRI path codepoint
+     */
+    public static boolean isIpathnodelims(int codepoint) {
+        if (!isIpath(codepoint)) {
+            return false;
+        }
+        return !isGenDelim(codepoint);
+    }
+
+    /**
+     * True if the codepoint is an IRI path character, a private-use codepoint, or one of ; / ? %
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in an IRI query
+     */
+    public static boolean isIquery(int codepoint) {
+        return isIpchar(codepoint) || isIprivate(codepoint) || ";/?%".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is an IRI path character, a private-use codepoint, or one of / ? %
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in an IRI fragment
+     */
+    public static boolean isIfragment(int codepoint) {
+        return isIpchar(codepoint) || isIprivate(codepoint) || "/?%".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is IRI-unreserved or one of ! $ & ' ( ) * + , ; = or a double quote.
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in an IRI registered name
+     */
+    public static boolean isIregname(int codepoint) {
+        return isIunreserved(codepoint) || "!$&'()*+,;=\"".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is a hex digit or one of : [ ]
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in an IP literal
+     */
+    public static boolean isIpliteral(int codepoint) {
+        return isHex(codepoint) || ":[]".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is allowed in an IRI registered name or in an IP literal.
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in an IRI host
+     */
+    public static boolean isIhost(int codepoint) {
+        return isIregname(codepoint) || isIpliteral(codepoint);
+    }
+
+    /**
+     * True if the codepoint is unreserved or one of ! $ & ' ( ) * + , ; = or a double quote.
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in a registered name
+     */
+    public static boolean isRegname(int codepoint) {
+        return isUnreserved(codepoint) || "!$&'()*+,;=\"".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is IRI-unreserved or one of ; : & = + $ ,
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in IRI user info
+     */
+    public static boolean isIuserinfo(int codepoint) {
+        return isIunreserved(codepoint) || ";:&=+$,".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * True if the codepoint is allowed in IRI user info or an IRI registered name, is an ASCII letter
+     * or digit, or is one of . : @ [ ] % -
+     * @param codepoint code point
+     * @return true if the codepoint is allowed in an IRI server part
+     */
+    public static boolean isIserver(int codepoint) {
+        return isIuserinfo(codepoint)
+                || isIregname(codepoint)
+                || isAlphaDigit(codepoint)
+                || ".:@[]%-".indexOf(codepoint) >= 0;
+    }
+
+    /**
+     * Verifies a sequence of codepoints using the specified filter.
+     * @param ci code point iterator
+     * @param profile profile
+     */
+    public static void verify(CodepointIterator ci, Profile profile) {
+        CodepointIterator rci = CodepointIterator.restrict(ci, profile.filter());
+        while (rci.hasNext()) {
+            rci.next();
+        }
+    }
+
+    /**
+     * Verifies a sequence of codepoints using the specified profile. A null string is not checked.
+     * @param s string
+     * @param profile profile
+     */
+    public static void verify(String s, Profile profile) {
+        if (s == null) {
+            return;
+        }
+        verify(CodepointIterator.forCharSequence(s), profile);
+    }
+
+}
diff --git a/net/src/main/java/org/xbib/net/util/Codepoint.java b/net/src/main/java/org/xbib/net/util/Codepoint.java
new file mode 100644
index 0000000..af39f37
--- /dev/null
+++ b/net/src/main/java/org/xbib/net/util/Codepoint.java
@@ -0,0 +1,92 @@
+package org.xbib.net.util;
+
+/**
+ * Represents a single Unicode Codepoint.
+ */
+public class Codepoint implements Comparable<Codepoint> {
+
+    private final int value;
+
+    /**
+     * Create a codepoint from a single char.
+     * @param value char
+     */
+    public Codepoint(char value) {
+        this((int) value);
+    }
+
+    /**
+     * Create a codepoint from a specific integer value. Negative values are rejected.
+     * @param value value
+     */
+    public Codepoint(int value) {
+        if (value < 0) {
+            throw new IllegalArgumentException("invalid codepoint");
+        }
+        this.value = value;
+    }
+
+    /**
+     * The codepoint value.
+     * @return value
+     */
+    public int getValue() {
+        return value;
+    }
+
+    @Override
+    public int compareTo(Codepoint o) {
+        return Integer.compare(value, o.value);
+    }
+
+    @Override
+    public String toString() {
+        return CharUtils.toString(value);
+    }
+
+    public char[] toChars() {
+        return toString().toCharArray();
+    }
+
+    /**
+     * Get the number of chars necessary to represent this codepoint. Returns 2 if this is a supplementary codepoint.
+     * @return char count
+     */
+    public int getCharCount() {
+        return toChars().length;
+    }
+
+    @Override
+    public int hashCode() {
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + value;
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null) {
+            return false;
+        }
+        if (getClass() != obj.getClass()) {
+            return false;
+        }
+        final Codepoint other = (Codepoint) obj;
+        return value == other.value;
+    }
+
+    /**
+     * Get the next codepoint, i.e. the one whose value is one higher. Fails at 0x10ffff.
+     * @return next code point
+     */
+    public Codepoint next() {
+        if (value == 0x10ffff) {
+            throw new IndexOutOfBoundsException();
+        }
+        return new Codepoint(value + 1);
+    }
+}
diff --git a/net/src/main/java/org/xbib/net/util/CodepointFilter.java b/net/src/main/java/org/xbib/net/util/CodepointFilter.java
new file mode 100644
index 0000000..d5a7f55
--- /dev/null
+++ b/net/src/main/java/org/xbib/net/util/CodepointFilter.java
@@ -0,0 +1,10 @@
+package org.xbib.net.util;
+
+/**
+ * Filters are used in a variety of ways to filter or verify unicode codepoints.
+ */
+@FunctionalInterface
+public interface CodepointFilter {
+
+    boolean accept(int ch);
+}
diff --git a/net/src/main/java/org/xbib/net/util/CodepointIterator.java b/net/src/main/java/org/xbib/net/util/CodepointIterator.java
new file mode 100644
index 0000000..6da70bc
--- /dev/null
+++ b/net/src/main/java/org/xbib/net/util/CodepointIterator.java
@@ -0,0 +1,268 @@
+package org.xbib.net.util;
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+/**
+ * Provides an iterator over Unicode Codepoints.
+ */
+public abstract class CodepointIterator implements Iterator<Codepoint> {
+
+    protected int position = -1;
+
+    protected int limit = -1;
+
+    public CodepointIterator() {
+    }
+
+    /**
+     * Get a CodepointIterator for the specified char array.
+     * @param array char array
+     * @return code point iterator
+     */
+    public static CodepointIterator forCharArray(char[] array) {
+        return new CharArrayCodepointIterator(array);
+    }
+
+    /**
+     * Get a CodepointIterator for the specified CharSequence.
+     * @param seq char sequence
+     * @return code point iterator
+     */
+    public static CodepointIterator forCharSequence(CharSequence seq) {
+        return new CharSequenceCodepointIterator(seq);
+    }
+
+    /**
+     * Wrap an iterator in a restricted, non-scanning iterator using the given filter.
+     * @param ci iterator to wrap
+     * @param filter filter applied to each codepoint
+     * @return restricted iterator
+     */
+    public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter) {
+        return restrict(ci, filter, false);
+    }
+
+    /**
+     * Wrap an iterator in a restricted iterator using the given filter.
+     * @param ci iterator to wrap
+     * @param filter filter applied to each codepoint
+     * @param scanning passed to the restricted iterator as its scanning flag
+     * @return restricted iterator
+     */
+    public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter, boolean scanning) {
+        return new RestrictedCodepointIterator(ci, filter, scanning);
+    }
+
+    /**
+     * Wrap an iterator in a restricted iterator using the given filter.
+     * @param ci iterator to wrap
+     * @param filter filter applied to each codepoint
+     * @param scanning passed to the restricted iterator as its scanning flag
+     * @param invert passed to the restricted iterator as its inversion flag
+     * @return restricted iterator
+     */
+    public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter, boolean scanning, boolean invert) {
+        return new RestrictedCodepointIterator(ci, filter, scanning, invert);
+    }
+
+    /**
+     * Restrict this iterator; see the static variant.
+     * @param filter filter applied to each codepoint
+     * @return restricted iterator over this iterator
+     */
+    public CodepointIterator restrict(CodepointFilter filter) {
+        return new RestrictedCodepointIterator(this, filter, false);
+    }
+
+    /**
+     * Restrict this iterator; see the static variant.
+     * @param filter filter applied to each codepoint
+     * @param scanning passed to the restricted iterator as its scanning flag
+     * @return restricted iterator over this iterator
+     */
+    public CodepointIterator restrict(CodepointFilter filter, boolean scanning) {
+        return new RestrictedCodepointIterator(this, filter, scanning);
+    }
+
+    /**
+     * Restrict this iterator; see the static variant.
+     * @param filter filter applied to each codepoint
+     * @param scanning passed to the restricted iterator as its scanning flag
+     * @param invert passed to the restricted iterator as its inversion flag
+     * @return restricted iterator over this iterator
+     */
+    public CodepointIterator restrict(CodepointFilter filter, boolean scanning, boolean invert) {
+        return new RestrictedCodepointIterator(this, filter, scanning, invert);
+    }
+
+    /**
+     * Get the next char.
+     * @return char
+     */
+    protected abstract char get();
+
+    /**
+     * Get the specified char.
+     * @param index index
+     * @return char
+     */
+    protected abstract char get(int index);
+
+    /**
+     * Checks if there are codepoints remaining.
+     * @return true if there are codepoints remaining
+     */
+    @Override
+    public boolean hasNext() {
+        return 0 < remaining();
+    }
+
+    /**
+     * Return the final index position: -1 for a negative position, the position itself once it has
+     * reached the limit, otherwise the position minus one.
+     * @return final index position
+     */
+    public int lastPosition() {
+        int current = position();
+        if (current <= -1) {
+            return -1;
+        }
+        if (current >= limit()) {
+            return current;
+        }
+        return current - 1;
+    }
+
+    /**
+     * Return the next chars. 
If the codepoint is not supplemental, the char array will have a single member. If the
+     * codepoint is supplemental, the char array will have two members, representing the high and low surrogate chars.
+     * A surrogate pair is always returned as {high, low}. A surrogate that has no neighbour to pair with
+     * (high surrogate in the last position, low surrogate in the first) is returned as a single char.
+     * @return next chars, or null if nothing is left
+     * @throws InvalidCharacterException if the neighbour of a surrogate is not the matching surrogate half
+     */
+    public char[] nextChars() {
+        if (!hasNext()) {
+            return null;
+        }
+        if (isNextSurrogate()) {
+            // NOTE(review): the index arithmetic below assumes get() returns the char at the current
+            // position and advances the position by one - confirm against the concrete iterators
+            char c1 = get();
+            if (CharUtils.isHighSurrogate(c1) && position() < limit()) {
+                char c2 = get();
+                if (CharUtils.isLowSurrogate(c2)) {
+                    return new char[]{c1, c2};
+                } else {
+                    throw new InvalidCharacterException(c2);
+                }
+            } else if (CharUtils.isLowSurrogate(c1) && position() > 1) {
+                // c1 was read from position() - 1, its possible high half sits at position() - 2;
+                // the former guard "position() > 0" was always true after get()
+                char c2 = get(position() - 2);
+                if (CharUtils.isHighSurrogate(c2)) {
+                    // high surrogate first, as expected by toCodepoint and as returned by peekChars
+                    return new char[]{c2, c1};
+                } else {
+                    throw new InvalidCharacterException(c2);
+                }
+            }
+            // unpaired surrogate at a boundary: c1 is already consumed, do not call get() again
+            return new char[]{c1};
+        }
+        return new char[]{get()};
+    }
+
+    /**
+     * Peek the next chars in the iterator. If the codepoint is not supplemental, the char array will have a single
+     * member. If the codepoint is supplemental, the char array will have two members, representing the high and low
+     * surrogate chars.
+     * @return chars
+     */
+    public char[] peekChars() {
+        return peekChars(position());
+    }
+
+    /**
+     * Peek the specified chars in the iterator. If the codepoint is not supplemental, the char array will have a single
+     * member. If the codepoint is supplemental, the char array will have two members, representing the high and low
+     * surrogate chars. 
+     * @return chars
+     */
+    private char[] peekChars(int pos) {
+        if (pos < 0 || pos >= limit()) {
+            return null;
+        }
+        char c1 = get(pos);
+        // a low half can only follow if pos + 1 is still inside the limit; "pos < limit()" was
+        // always true here and let get(pos + 1) read past the end for a trailing high surrogate
+        if (CharUtils.isHighSurrogate(c1) && pos + 1 < limit()) {
+            char c2 = get(pos + 1);
+            if (CharUtils.isLowSurrogate(c2)) {
+                return new char[]{c1, c2};
+            } else {
+                throw new InvalidCharacterException(c2);
+            }
+        } else if (CharUtils.isLowSurrogate(c1) && pos > 0) {
+            // a high half can precede from index 0 on; "pos > 1" skipped the pair at index 0/1
+            char c2 = get(pos - 1);
+            if (CharUtils.isHighSurrogate(c2)) {
+                return new char[]{c2, c1};
+            } else {
+                throw new InvalidCharacterException(c2);
+            }
+        } else {
+            // plain char, or a surrogate without a neighbour to pair with
+            return new char[]{c1};
+        }
+    }
+
+    /**
+     * Return the next codepoint.
+     * @return code point
+     * @throws NoSuchElementException if no chars remain
+     */
+    @Override
+    public Codepoint next() {
+        if (remaining() > 0) {
+            return toCodepoint(nextChars());
+        } else {
+            throw new NoSuchElementException();
+        }
+    }
+
+    /**
+     * Peek the next codepoint.
+     * @return code point, or null if the position is outside the valid range
+     */
+    public Codepoint peek() {
+        return toCodepoint(peekChars());
+    }
+
+    /**
+     * Peek the specified codepoint.
+     * @param index index
+     * @return code point, or null if the index is outside the valid range
+     */
+    public Codepoint peek(int index) {
+        return toCodepoint(peekChars(index));
+    }
+
+    /**
+     * Converts one char or a {high, low} surrogate pair to a codepoint; null stays null.
+     */
+    private Codepoint toCodepoint(char[] chars) {
+        return (chars == null) ? null : (chars.length == 1) ? new Codepoint(chars[0]) : CharUtils
+                .toSupplementary(chars[0], chars[1]);
+    }
+
+    /**
+     * Set the iterator position. The limit itself is a valid position.
+     * @param n iterator position
+     */
+    public void position(int n) {
+        if (n < 0 || n > limit()) {
+            throw new ArrayIndexOutOfBoundsException(n);
+        }
+        position = n;
+    }
+
+    /**
+     * Get the iterator position.
+     * @return position
+     */
+    public int position() {
+        return position;
+    }
+
+    /**
+     * Return the iterator limit.
+     * @return limit
+     */
+    public int limit() {
+        return limit;
+    }
+
+    /**
+     * Return the remaining iterator size.
+     * @return remaining size
+     */
+    public int remaining() {
+        return limit - position();
+    }
+
+    /**
+     * True if a char is left and the char at the current position is a surrogate (high or low).
+     */
+    private boolean isNextSurrogate() {
+        if (!hasNext()) {
+            return false;
+        }
+        char c = get(position());
+        return CharUtils.isHighSurrogate(c) || CharUtils.isLowSurrogate(c);
+    }
+
+    /**
+     * Returns true if the char at the specified index is a high surrogate.
+     * @param index index, must be at least 0 and below the limit
+     * @return true if the char at the specified index is a high surrogate
+     * @throws ArrayIndexOutOfBoundsException if the index is negative or not below the limit
+     */
+    public boolean isHigh(int index) {
+        // the limit itself is not a readable index, so it has to be rejected as well
+        if (index < 0 || index >= limit()) {
+            throw new ArrayIndexOutOfBoundsException(index);
+        }
+        return CharUtils.isHighSurrogate(get(index));
+    }
+
+    /**
+     * Returns true if the char at the specified index is a low surrogate.
+     * @param index index, must be at least 0 and below the limit
+     * @return true if the char at the specified index is a low surrogate
+     * @throws ArrayIndexOutOfBoundsException if the index is negative or not below the limit
+     */
+    public boolean isLow(int index) {
+        // the limit itself is not a readable index, so it has to be rejected as well
+        if (index < 0 || index >= limit()) {
+            throw new ArrayIndexOutOfBoundsException(index);
+        }
+        return CharUtils.isLowSurrogate(get(index));
+    }
+
+    /**
+     * Removal is not supported.
+     */
+    @Override
+    public void remove() {
+        throw new UnsupportedOperationException();
+    }
+
+}
diff --git a/net/src/main/java/org/xbib/net/util/DelegatingCodepointIterator.java b/net/src/main/java/org/xbib/net/util/DelegatingCodepointIterator.java
new file mode 100644
index 0000000..528ab5b
--- /dev/null
+++ b/net/src/main/java/org/xbib/net/util/DelegatingCodepointIterator.java
@@ -0,0 +1,92 @@
+package org.xbib.net.util;
+
+import java.util.NoSuchElementException;
+
+/**
+ * Base implementation of a CodepointIterator that filters the output of another CodepointIterator.
+ */
+public abstract class DelegatingCodepointIterator extends CodepointIterator {
+
+    private final CodepointIterator internal;
+
+    protected DelegatingCodepointIterator(CodepointIterator internal) {
+        this.internal = internal;
+    }
+
+    @Override
+    protected char get() {
+        return internal.get();
+    }
+
+    @Override
+    protected char get(int index) {
+        return internal.get(index);
+    }
+
+    @Override
+    public boolean hasNext() {
+        return internal.hasNext();
+    }
+
+    @Override
+    public boolean isHigh(int index) {
+        return internal.isHigh(index);
+    }
+
+    @Override
+    public boolean isLow(int index) {
+        return internal.isLow(index);
+    }
+
+    @Override
+    public int limit() {
+        return internal.limit();
+    }
+
+    /**
+     * Return the next codepoint of the wrapped iterator.
+     * @return code point
+     * @throws NoSuchElementException if the wrapped iterator is exhausted
+     */
+    @Override
+    public Codepoint next() {
+        // Ask the wrapped iterator directly. The former cached flag was only refreshed by hasNext(),
+        // so next() failed when hasNext() had not been called first. Subclass hasNext() overrides
+        // are deliberately bypassed, as before.
+        if (!internal.hasNext()) {
+            throw new NoSuchElementException();
+        }
+        return internal.next();
+    }
+
+    @Override
+    public char[] nextChars() {
+        return internal.nextChars();
+    }
+
+    @Override
+    public Codepoint peek() {
+        return internal.peek();
+    }
+
+    @Override
+    public Codepoint peek(int index) {
+        return internal.peek(index);
+    }
+
+    @Override
+    public char[] peekChars() {
+        return internal.peekChars();
+    }
+
+    @Override
+    public int position() {
+        return internal.position();
+    }
+
+    @Override
+    public int remaining() {
+        return internal.remaining();
+    }
+
+    @Override
+    public void position(int position) {
+        internal.position(position);
+    }
+
+}
diff --git a/net/src/main/java/org/xbib/net/util/InvalidCharacterException.java b/net/src/main/java/org/xbib/net/util/InvalidCharacterException.java
new file mode 100644
index 0000000..21d2050
--- /dev/null
+++ b/net/src/main/java/org/xbib/net/util/InvalidCharacterException.java
@@ -0,0 +1,17 @@
+package org.xbib.net.util;
+
+/**
+ * Signals a char or codepoint that is not allowed where it was found.
+ */
+@SuppressWarnings("serial")
+public class InvalidCharacterException extends RuntimeException {
+
+    private final int input;
+
+    /**
+     * @param input the offending char or codepoint
+     */
+    public InvalidCharacterException(int input) {
+        this.input = input;
+    }
+
+    @Override
+    public String getMessage() {
+        // a plain (char) cast would truncate supplementary codepoints to an unrelated char
+        String shown = Character.isValidCodePoint(input)
+                ? new String(Character.toChars(input))
+                : String.valueOf((char) input);
+        return "Invalid Character 0x" + Integer.toHexString(input) + "(" + shown + ")";
+    }
+
+}
diff --git a/net/src/main/java/org/xbib/net/util/Profile.java b/net/src/main/java/org/xbib/net/util/Profile.java
new file mode 100644
index 0000000..54db404
--- /dev/null
+++ b/net/src/main/java/org/xbib/net/util/Profile.java
@@ -0,0 +1,54 @@
+package org.xbib.net.util;
+
+/**
+ * Named codepoint filters. Apart from NONE, whose filter returns true for every codepoint, each
+ * filter returns true for a codepoint that is NOT matched by the CharUtils checks it names.
+ * RestrictedCodepointIterator, unless inverted, treats a codepoint for which the filter returns true
+ * as invalid.
+ */
+public enum Profile {
+    NONE(codepoint -> true),
+    ALPHA(codepoint -> !CharUtils.isAlpha(codepoint)),
+    ALPHANUM(codepoint -> !CharUtils.isAlphaDigit(codepoint)),
+    FRAGMENT(codepoint -> !CharUtils.isFragment(codepoint)),
+    IFRAGMENT(codepoint -> !CharUtils.isIfragment(codepoint)),
+    PATH(codepoint -> !CharUtils.isPath(codepoint)),
+    IPATH(codepoint -> !CharUtils.isIpath(codepoint)),
+    IUSERINFO(codepoint -> !CharUtils.isIuserinfo(codepoint)),
+    USERINFO(codepoint -> !CharUtils.isUserInfo(codepoint)),
+    QUERY(codepoint -> !CharUtils.isQuery(codepoint)),
+    IQUERY(codepoint -> !CharUtils.isIquery(codepoint)),
+    SCHEME(codepoint -> !CharUtils.isScheme(codepoint)),
+    PATHNODELIMS(codepoint -> !CharUtils.isPathNoDelims(codepoint)),
+    IPATHNODELIMS(codepoint -> !CharUtils.isIpathnodelims(codepoint)),
+    IPATHNODELIMS_SEG(codepoint -> !CharUtils.isIpathnodelims(codepoint) && codepoint != '@' && codepoint != ':'),
+    IREGNAME(codepoint -> !CharUtils.isIregname(codepoint)),
+    IHOST(codepoint -> !CharUtils.isIhost(codepoint)),
+    IPRIVATE(codepoint -> !CharUtils.isIprivate(codepoint)),
+    RESERVED(codepoint -> !CharUtils.isReserved(codepoint)),
+    IUNRESERVED(codepoint -> !CharUtils.isIunreserved(codepoint)),
+    UNRESERVED(codepoint -> !CharUtils.isUnreserved(codepoint)),
+    SCHEMESPECIFICPART(codepoint -> !CharUtils.isIunreserved(codepoint) && !CharUtils.isReserved(codepoint)
+            && !CharUtils.isIprivate(codepoint)
+            && !CharUtils.isPctEnc(codepoint)
+            && codepoint != '#'),
+    AUTHORITY(codepoint -> !CharUtils.isRegname(codepoint) && !CharUtils.isUserInfo(codepoint) && !CharUtils.isGenDelim(codepoint)),
+    ASCIISANSCRLF(codepoint -> !CharUtils.inRange(codepoint, 1, 9) && !CharUtils.inRange(codepoint, 14, 127)),
+    PCT(codepoint -> !CharUtils.isPctEnc(codepoint)),
+    // NOTE(review): this filter returns true for everything outside the listed ASCII ranges,
+    // which includes letters, digits and '-' - confirm that this polarity is intended
+    STD3ASCIIRULES(codepoint -> !CharUtils.inRange(codepoint, 0x0000, 0x002C) &&
+            !CharUtils.inRange(codepoint, 0x002E, 0x002F) &&
+            !CharUtils.inRange(codepoint, 0x003A, 0x0040) &&
+            !CharUtils.inRange(codepoint, 0x005B, 0x0060) &&
+            !CharUtils.inRange(codepoint, 0x007B, 0x007F));
+
+    private final CodepointFilter filter;
+
+    Profile(CodepointFilter filter) {
+        this.filter = filter;
+    }
+
+    /**
+     * @return the filter of this profile
+     */
+    public CodepointFilter filter() {
+        return filter;
+    }
+
+    /**
+     * Applies the filter of this profile to a codepoint.
+     * @param codepoint code point
+     * @return the result of the filter
+     */
+    public boolean check(int codepoint) {
+        return filter.accept(codepoint);
+    }
+}
diff --git a/net/src/main/java/org/xbib/net/util/RestrictedCodepointIterator.java b/net/src/main/java/org/xbib/net/util/RestrictedCodepointIterator.java
new file mode 100644
index 0000000..15407e5
--- /dev/null
+++ b/net/src/main/java/org/xbib/net/util/RestrictedCodepointIterator.java
@@ -0,0 +1,83 @@
+package org.xbib.net.util;
+
+/**
+ * Iterator that checks every codepoint of a wrapped iterator against a filter. In scanning mode the
+ * iteration stops in front of the first flagged codepoint, otherwise a flagged codepoint raises an
+ * InvalidCharacterException.
+ */
+class RestrictedCodepointIterator extends DelegatingCodepointIterator {
+
+    private final CodepointFilter filter;
+    private final boolean scanningOnly;
+    private final boolean notset;
+
+    RestrictedCodepointIterator(CodepointIterator internal, CodepointFilter filter, boolean scanningOnly) {
+        this(internal, filter, scanningOnly, false);
+    }
+
+    RestrictedCodepointIterator(CodepointIterator internal,
+                                CodepointFilter filter,
+                                boolean scanningOnly,
+                                boolean notset) {
+        super(internal);
+        this.filter = filter;
+        this.scanningOnly = scanningOnly;
+        this.notset = notset;
+    }
+
+    @Override
+    public boolean hasNext() {
+        boolean b = super.hasNext();
+        // peek only while something is left: at the end peek() yields null, which used to be
+        // dereferenced without a check in scanning mode
+        if (b && scanningOnly) {
+            try {
+                Codepoint cp = super.peek(super.position());
+                if (cp != null && check(cp.getValue())) {
+                    return false;
+                }
+            } catch (InvalidCharacterException e) {
+                return false;
+            }
+        }
+        return b;
+    }
+
+    @Override
+    public Codepoint next() {
+        Codepoint cp = super.next();
+        int v = cp.getValue();
+        if (v != -1 && check(v)) {
+            if (scanningOnly) {
+                // step back over the whole codepoint; a surrogate pair occupies two chars
+                super.position(super.position() - Character.charCount(v));
+                return null;
+            } else {
+                throw new InvalidCharacterException(v);
+            }
+        }
+        return cp;
+    }
+
+    /**
+     * True if the codepoint is flagged: the filter result, negated when the iterator is inverted.
+     */
+    private boolean check(int cp) {
+        return notset == !filter.accept(cp);
+    }
+
+    @Override
+    public char[] nextChars() {
+        char[] chars = super.nextChars();
+        if (chars != null && chars.length > 0) {
+            if (chars.length == 1 && check(chars[0])) {
+                if (scanningOnly) {
+                    super.position(super.position() - 1);
+                    return null;
+                } else {
+                    throw new InvalidCharacterException(chars[0]);
+                }
+            } else if (chars.length == 2) {
+                int cp = CharUtils.toSupplementary(chars[0], chars[1]).getValue();
+                if (check(cp)) {
+                    if (scanningOnly) {
+                        super.position(super.position() - 2);
+                        return null;
+                    } else {
+                        throw new InvalidCharacterException(cp);
+                    }
+                }
+            }
+        }
+        return chars;
+    }
+}
diff --git a/net/src/test/java/org/xbib/net/OtherIRITest.java b/net/src/test/java/org/xbib/net/OtherIRITest.java
new file mode 100644
index 0000000..a6b7a69
--- /dev/null
+++ b/net/src/test/java/org/xbib/net/OtherIRITest.java
@@ -0,0 +1,202 @@
+package org.xbib.net;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+@Disabled
+public class OtherIRITest {
+
+    @Test
+    public void testSimple() throws Exception {
+        IRI iri = IRI.create("http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org");
+        assertEquals("http", iri.getScheme());
+        assertEquals("validator.w3.org", iri.getHost());
+        assertEquals("/check", iri.getPath());
+        assertEquals("//validator.w3.org/check?uri=http%3A%2F%2Frésumé.example.org", iri.getSchemeSpecificPart());
+
+        assertEquals("http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org", iri.toString());
+
assertEquals("http://validator.w3.org/check?uri=http%3A%2F%2Fr%C3%A9sum%C3%A9.example.org", iri.toURI().toString());
+    }
+
+    @Test
+    public void testIpv4() throws Exception {
+        IRI iri = IRI.create("http://127.0.0.1");
+        assertEquals("http://127.0.0.1", iri.toURI().toString());
+    }
+
+    @Test
+    public void testIpv6() throws Exception {
+        IRI iri = IRI.create("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]");
+        assertEquals("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", iri.toURI().toString());
+    }
+
+    @Test
+    public void testUnderscore() throws Exception{
+        IRI iri = IRI.create("http://its_gbsc.cn.ibm.com/");
+        assertEquals("http://its_gbsc.cn.ibm.com/", iri.toURI().toString());
+    }
+
+    @Test
+    public void testIpv6Invalid() throws URISyntaxException {
+        // NOTE(review): no assertion here; the test only passes when the conversion does not throw.
+        // Confirm whether a failure is expected for the invalid hex digit 'o'.
+        IRI iri = IRI.create("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:734o]");
+        iri.toURI().toString();
+    }
+
+    @Test
+    public void testFile() throws Exception {
+        IRI iri = IRI.create("file:///tmp/test/foo");
+        assertEquals("file:///tmp/test/foo", iri.toURI().toString());
+    }
+
+    @Test
+    public void testSimple2() throws Exception {
+        IRI iri = IRI.create("http://www.example.org/red%09ros\u00E9#red");
+        assertEquals("http://www.example.org/red%09ros%C3%A9#red", iri.toURI().toString());
+    }
+
+    @Test
+    public void testNotSoSimple() throws Exception {
+        IRI iri = IRI.create("http://example.com/\uD800\uDF00\uD800\uDF01\uD800\uDF02");
+        assertEquals("http://example.com/%F0%90%8C%80%F0%90%8C%81%F0%90%8C%82", iri.toURI().toString());
+    }
+
+    @Test
+    public void testIRItoURI() throws Exception {
+        IRI iri = IRI.create("http://\u7D0D\u8C46.example.org/%E2%80%AE");
+        URI uri = iri.toURI();
+        assertEquals("http://xn--99zt52a.example.org/%E2%80%AE", uri.toString());
+    }
+
+    @Test
+    public void testComparison() throws Exception {
+        IRI iri1 = IRI.create("http://www.example.org/");
+        IRI iri2 = IRI.create("http://www.example.org/..");
+        IRI iri3 = IRI.create("http://www.Example.org:80");
+
+        assertFalse(iri1.equals(iri2)); // false
+        assertFalse(iri1.equals(iri3)); // false
+        assertFalse(iri2.equals(iri1)); // false
+        assertFalse(iri2.equals(iri3)); // false
+        assertFalse(iri3.equals(iri1)); // false
+        assertFalse(iri3.equals(iri2)); // false
+
+        /*assertTrue(iri1.normalize().equals(iri2.normalize()));
+        assertTrue(iri1.normalize().equals(iri3.normalize()));
+        assertTrue(iri2.normalize().equals(iri1.normalize()));
+        assertTrue(iri2.normalize().equals(iri3.normalize()));
+        assertTrue(iri3.normalize().equals(iri1.normalize()));
+        assertTrue(iri3.normalize().equals(iri2.normalize()));*/
+
+    }
+
+    @Test
+    public void testUCN() throws Exception {
+        //IRI iri1 = IRI.create("http://www.example.org/r\u00E9sum\u00E9.html");
+        //IRI iri2 = IRI.create("http://www.example.org/re\u0301sume\u0301.html", Normalizer.Form.NFC);
+        //assertEquals(iri2, iri1);
+    }
+
+    @Test
+    public void testPercent() throws Exception {
+        IRI iri1 = IRI.create("http://example.org/%7e%2Fuser?%2f");
+        IRI iri2 = IRI.create("http://example.org/%7E%2fuser?/");
+        //assertTrue(iri1.normalize().equals(iri2.normalize()));
+    }
+
+    @Test
+    public void testIDN() throws Exception {
+        IRI iri1 = IRI.create("http://r\u00E9sum\u00E9.example.org");
+        assertEquals("xn--rsum-bpad.example.org", iri1.getASCIIHost());
+    }
+
+    @Test
+    public void testRelative() throws Exception {
+        IRI base = IRI.create("http://example.org/foo/");
+
+        assertEquals("http://example.org/", base.resolve("/").toString());
+        assertEquals("http://example.org/test", base.resolve("/test").toString());
+        assertEquals("http://example.org/foo/test", base.resolve("test").toString());
+        assertEquals("http://example.org/test", base.resolve("../test").toString());
+        assertEquals("http://example.org/foo/test", base.resolve("./test").toString());
+        assertEquals("http://example.org/foo/", base.resolve("test/test/../../").toString());
+        assertEquals("http://example.org/foo/?test", base.resolve("?test").toString());
+        assertEquals("http://example.org/foo/#test", base.resolve("#test").toString());
+        assertEquals("http://example.org/foo/", base.resolve(".").toString());
+    }
+
+    /**
+     * Try a variety of URI schemes. If any problematic schemes pop up, we should add a test for 'em here
+     */
+    @Test
+    public void testSchemes() throws Exception {
+
+        IRI iri = IRI.create("http://a:b@c.org:80/d/e?f#g");
+        assertEquals("http", iri.getScheme());
+        assertEquals("a:b", iri.getUserInfo());
+        assertEquals("c.org", iri.getHost());
+        assertEquals(80, iri.getPort());
+        assertEquals("/d/e", iri.getPath());
+        assertEquals("f", iri.getQuery());
+        assertEquals("g", iri.getFragment());
+
+        iri = IRI.create("https://a:b@c.org:80/d/e?f#g");
+        assertEquals("https", iri.getScheme());
+        assertEquals("a:b", iri.getUserInfo());
+        assertEquals("c.org", iri.getHost());
+        assertEquals(80, iri.getPort());
+        assertEquals("/d/e", iri.getPath());
+        assertEquals("f", iri.getQuery());
+        assertEquals("g", iri.getFragment());
+
+        iri = IRI.create("ftp://a:b@c.org:80/d/e?f#g");
+        assertEquals("ftp", iri.getScheme());
+        assertEquals("a:b", iri.getUserInfo());
+        assertEquals("c.org", iri.getHost());
+        assertEquals(80, iri.getPort());
+        assertEquals("/d/e", iri.getPath());
+        assertEquals("f", iri.getQuery());
+        assertEquals("g", iri.getFragment());
+
+        iri = IRI.create("mailto:joe@example.org?subject=foo");
+        assertEquals("mailto", iri.getScheme());
+        assertEquals(null, iri.getUserInfo());
+        assertEquals(null, iri.getHost());
+        assertEquals(-1, iri.getPort());
+        assertEquals("joe@example.org", iri.getPath());
+        assertEquals("subject=foo", iri.getQuery());
+        assertEquals(null, iri.getFragment());
+
+        iri = IRI.create("tag:example.org,2006:foo");
+        assertEquals("tag", iri.getScheme());
+        assertEquals(null, iri.getUserInfo());
+        assertEquals(null, iri.getHost());
+        assertEquals(-1, iri.getPort());
+        assertEquals("example.org,2006:foo", iri.getPath());
+        assertEquals(null, iri.getQuery());
+        assertEquals(null, iri.getFragment());
+
+        iri = IRI.create("urn:lsid:ibm.com:example:82437234964354895798234d");
+        assertEquals("urn", iri.getScheme());
+        assertEquals(null, iri.getUserInfo());
+        assertEquals(null, iri.getHost());
+        assertEquals(-1, iri.getPort());
+        assertEquals("lsid:ibm.com:example:82437234964354895798234d", iri.getPath());
+        assertEquals(null, iri.getQuery());
+        assertEquals(null, iri.getFragment());
+
+        // the literal was empty; scheme and path asserted below determine the data IRI
+        iri = IRI.create("data:image/gif;base64,R0lGODdhMAAwAPAAAAAAAP");
+        assertEquals("data", iri.getScheme());
+        assertEquals(null, iri.getUserInfo());
+        assertEquals(null, iri.getHost());
+        assertEquals(-1, iri.getPort());
+        assertEquals("image/gif;base64,R0lGODdhMAAwAPAAAAAAAP", iri.getPath());
+        assertEquals(null, iri.getQuery());
+        assertEquals(null, iri.getFragment());
+
+    }
+}
+