From 7d09b26148aaf23a23d0fa0eaeb68bf1dd03c1f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Prante?= Date: Sat, 22 Oct 2022 13:00:27 +0200 Subject: [PATCH] cleaning up IRI implementation --- .../java/org/xbib/content/resource/IRI.java | 2 +- .../content/resource/IRISyntaxException.java | 3 +- .../text/CharArrayCodepointIterator.java | 28 ++++ .../text/CharSequenceCodepointIterator.java | 25 +++ .../xbib/content/resource/text/CharUtils.java | 64 +------- .../{Filter.java => CodepointFilter.java} | 4 +- .../resource/text/CodepointIterator.java | 145 +----------------- .../text/InvalidCharacterException.java | 5 +- .../xbib/content/resource/text/Profile.java | 54 +++++++ .../text/RestrictedCodepointIterator.java | 83 ++++++++++ .../org/xbib/content/resource/TestIRI.java | 18 +++ 11 files changed, 225 insertions(+), 206 deletions(-) create mode 100644 content-resource/src/main/java/org/xbib/content/resource/text/CharArrayCodepointIterator.java create mode 100644 content-resource/src/main/java/org/xbib/content/resource/text/CharSequenceCodepointIterator.java rename content-resource/src/main/java/org/xbib/content/resource/text/{Filter.java => CodepointFilter.java} (70%) create mode 100644 content-resource/src/main/java/org/xbib/content/resource/text/Profile.java create mode 100644 content-resource/src/main/java/org/xbib/content/resource/text/RestrictedCodepointIterator.java create mode 100644 content-resource/src/test/java/org/xbib/content/resource/TestIRI.java diff --git a/content-resource/src/main/java/org/xbib/content/resource/IRI.java b/content-resource/src/main/java/org/xbib/content/resource/IRI.java index b63bfcb..2368197 100644 --- a/content-resource/src/main/java/org/xbib/content/resource/IRI.java +++ b/content-resource/src/main/java/org/xbib/content/resource/IRI.java @@ -1,7 +1,7 @@ package org.xbib.content.resource; import org.xbib.content.resource.text.CharUtils; -import org.xbib.content.resource.text.CharUtils.Profile; +import org.xbib.content.resource.text.Profile; import org.xbib.content.resource.text.InvalidCharacterException; import org.xbib.net.PercentDecoder; import org.xbib.net.PercentEncoders; diff --git a/content-resource/src/main/java/org/xbib/content/resource/IRISyntaxException.java b/content-resource/src/main/java/org/xbib/content/resource/IRISyntaxException.java index 7d53809..cc38aeb 100644 --- a/content-resource/src/main/java/org/xbib/content/resource/IRISyntaxException.java +++ b/content-resource/src/main/java/org/xbib/content/resource/IRISyntaxException.java @@ -3,10 +3,9 @@ package org.xbib.content.resource; /** * */ +@SuppressWarnings("serial") public class IRISyntaxException extends RuntimeException { - private static final long serialVersionUID = 1813084470937980392L; - IRISyntaxException(String message) { super(message); } diff --git a/content-resource/src/main/java/org/xbib/content/resource/text/CharArrayCodepointIterator.java b/content-resource/src/main/java/org/xbib/content/resource/text/CharArrayCodepointIterator.java new file mode 100644 index 0000000..ff1cfab --- /dev/null +++ b/content-resource/src/main/java/org/xbib/content/resource/text/CharArrayCodepointIterator.java @@ -0,0 +1,28 @@ +package org.xbib.content.resource.text; + +class CharArrayCodepointIterator extends CodepointIterator { + protected char[] buffer; + + CharArrayCodepointIterator(char[] buffer) { + this(buffer, 0, buffer.length); + } + + CharArrayCodepointIterator(char[] buffer, int n, int e) { + this.buffer = buffer; + this.position = n; + this.limit = Math.min(buffer.length - n, e); + } + + @Override + protected char get() { + return (position < limit) ? buffer[position++] : (char) -1; + } + + @Override + protected char get(int index) { + if (index < 0 || index >= limit) { + throw new ArrayIndexOutOfBoundsException(index); + } + return buffer[index]; + } +} diff --git a/content-resource/src/main/java/org/xbib/content/resource/text/CharSequenceCodepointIterator.java b/content-resource/src/main/java/org/xbib/content/resource/text/CharSequenceCodepointIterator.java new file mode 100644 index 0000000..b35f283 --- /dev/null +++ b/content-resource/src/main/java/org/xbib/content/resource/text/CharSequenceCodepointIterator.java @@ -0,0 +1,25 @@ +package org.xbib.content.resource.text; + +class CharSequenceCodepointIterator extends CodepointIterator { + private final CharSequence buffer; + + CharSequenceCodepointIterator(CharSequence buffer) { + this(buffer, 0, buffer.length()); + } + + CharSequenceCodepointIterator(CharSequence buffer, int n, int e) { + this.buffer = buffer; + this.position = n; + this.limit = Math.min(buffer.length() - n, e); + } + + @Override + protected char get() { + return buffer.charAt(position++); + } + + @Override + protected char get(int index) { + return buffer.charAt(index); + } +} diff --git a/content-resource/src/main/java/org/xbib/content/resource/text/CharUtils.java b/content-resource/src/main/java/org/xbib/content/resource/text/CharUtils.java index 79e2332..5f83e53 100644 --- a/content-resource/src/main/java/org/xbib/content/resource/text/CharUtils.java +++ b/content-resource/src/main/java/org/xbib/content/resource/text/CharUtils.java @@ -6,12 +6,12 @@ package org.xbib.content.resource.text; public final class CharUtils { public static final char LRE = 0x202A; - private static final char RLE = 0x202B; - private static final char LRO = 0x202D; - private static final char RLO = 0x202E; - private static final char LRM = 0x200E; - private static final char RLM = 0x200F; - private static final char PDF = 0x202C; + public static final char RLE = 0x202B; + public static final char LRO = 0x202D; + public static final char RLO = 0x202E; + public static final char LRM = 0x200E; + public static final char RLM = 0x200F; + public static final char PDF = 0x202C; private CharUtils() { } @@ -594,56 +594,4 @@ public final class CharUtils { verify(CodepointIterator.forCharSequence(s), profile); } - /** - * - */ - public enum Profile { - NONE(codepoint -> true), - ALPHA(codepoint -> !isAlpha(codepoint)), - ALPHANUM(codepoint -> !isAlphaDigit(codepoint)), - FRAGMENT(codepoint -> !isFragment(codepoint)), - IFRAGMENT(codepoint -> !isIfragment(codepoint)), - PATH(codepoint -> !isPath(codepoint)), - IPATH(codepoint -> !isIpath(codepoint)), - IUSERINFO(codepoint -> !isIuserinfo(codepoint)), - USERINFO(codepoint -> !isUserInfo(codepoint)), - QUERY(codepoint -> !isQuery(codepoint)), - IQUERY(codepoint -> !isIquery(codepoint)), - SCHEME(codepoint -> !isScheme(codepoint)), - PATHNODELIMS(codepoint -> !isPathNoDelims(codepoint)), - IPATHNODELIMS(codepoint -> !isIpathnodelims(codepoint)), - IPATHNODELIMS_SEG(codepoint -> !isIpathnodelims(codepoint) && codepoint != '@' && codepoint != ':'), - IREGNAME(codepoint -> !isIregname(codepoint)), - IHOST(codepoint -> !isIhost(codepoint)), - IPRIVATE(codepoint -> !isIprivate(codepoint)), - RESERVED(codepoint -> !isReserved(codepoint)), - IUNRESERVED(codepoint -> !isIunreserved(codepoint)), - UNRESERVED(codepoint -> !isUnreserved(codepoint)), - SCHEMESPECIFICPART(codepoint -> !isIunreserved(codepoint) && !isReserved(codepoint) - && !isIprivate(codepoint) - && !isPctEnc(codepoint) - && codepoint != '#'), - AUTHORITY(codepoint -> !isRegname(codepoint) && !isUserInfo(codepoint) && !isGenDelim(codepoint)), - ASCIISANSCRLF(codepoint -> !inRange(codepoint, 1, 9) && !inRange(codepoint, 14, 127)), - PCT(codepoint -> !isPctEnc(codepoint)), - STD3ASCIIRULES(codepoint -> !inRange(codepoint, 0x0000, 0x002C) && - !inRange(codepoint, 0x002E, 0x002F) && - !inRange(codepoint, 0x003A, 0x0040) && - !inRange(codepoint, 0x005B, 0x0060) && - !inRange(codepoint, 0x007B, 0x007F)); - private final Filter filter; - - Profile(Filter filter) { - this.filter = filter; - } - - public Filter filter() { - return filter; - } - - public boolean check(int codepoint) { - return filter.accept(codepoint); - } - } - } diff --git a/content-resource/src/main/java/org/xbib/content/resource/text/Filter.java b/content-resource/src/main/java/org/xbib/content/resource/text/CodepointFilter.java similarity index 70% rename from content-resource/src/main/java/org/xbib/content/resource/text/Filter.java rename to content-resource/src/main/java/org/xbib/content/resource/text/CodepointFilter.java index febf6f1..a53accd 100644 --- a/content-resource/src/main/java/org/xbib/content/resource/text/Filter.java +++ b/content-resource/src/main/java/org/xbib/content/resource/text/CodepointFilter.java @@ -4,7 +4,7 @@ package org.xbib.content.resource.text; * Filters are used in a variety of ways to filter or verify unicode codepoints. */ @FunctionalInterface -public interface Filter { +public interface CodepointFilter { - boolean accept(int c); + boolean accept(int ch); } diff --git a/content-resource/src/main/java/org/xbib/content/resource/text/CodepointIterator.java b/content-resource/src/main/java/org/xbib/content/resource/text/CodepointIterator.java index 848e004..0d8ef1e 100644 --- a/content-resource/src/main/java/org/xbib/content/resource/text/CodepointIterator.java +++ b/content-resource/src/main/java/org/xbib/content/resource/text/CodepointIterator.java @@ -33,28 +33,27 @@ public abstract class CodepointIterator implements Iterator { return new CharSequenceCodepointIterator(seq); } - - public static CodepointIterator restrict(CodepointIterator ci, Filter filter) { + public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter) { return new RestrictedCodepointIterator(ci, filter, false); } - public static CodepointIterator restrict(CodepointIterator ci, Filter filter, boolean scanning) { + public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter, boolean scanning) { return new RestrictedCodepointIterator(ci, filter, scanning); } - public static CodepointIterator restrict(CodepointIterator ci, Filter filter, boolean scanning, boolean invert) { + public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter, boolean scanning, boolean invert) { return new RestrictedCodepointIterator(ci, filter, scanning, invert); } - public CodepointIterator restrict(Filter filter) { + public CodepointIterator restrict(CodepointFilter filter) { return restrict(this, filter); } - public CodepointIterator restrict(Filter filter, boolean scanning) { + public CodepointIterator restrict(CodepointFilter filter, boolean scanning) { return restrict(this, filter, scanning); } - public CodepointIterator restrict(Filter filter, boolean scanning, boolean invert) { + public CodepointIterator restrict(CodepointFilter filter, boolean scanning, boolean invert) { return restrict(this, filter, scanning, invert); } @@ -266,136 +265,4 @@ public abstract class CodepointIterator implements Iterator { throw new UnsupportedOperationException(); } - private static class CharArrayCodepointIterator extends CodepointIterator { - protected char[] buffer; - - CharArrayCodepointIterator(char[] buffer) { - this(buffer, 0, buffer.length); - } - - CharArrayCodepointIterator(char[] buffer, int n, int e) { - this.buffer = buffer; - this.position = n; - this.limit = Math.min(buffer.length - n, e); - } - - @Override - protected char get() { - return (position < limit) ? buffer[position++] : (char) -1; - } - - @Override - protected char get(int index) { - if (index < 0 || index >= limit) { - throw new ArrayIndexOutOfBoundsException(index); - } - return buffer[index]; - } - } - - private static class CharSequenceCodepointIterator extends CodepointIterator { - private CharSequence buffer; - - CharSequenceCodepointIterator(CharSequence buffer) { - this(buffer, 0, buffer.length()); - } - - CharSequenceCodepointIterator(CharSequence buffer, int n, int e) { - this.buffer = buffer; - this.position = n; - this.limit = Math.min(buffer.length() - n, e); - } - - @Override - protected char get() { - return buffer.charAt(position++); - } - - @Override - protected char get(int index) { - return buffer.charAt(index); - } - } - - private static class RestrictedCodepointIterator extends DelegatingCodepointIterator { - - private final Filter filter; - private final boolean scanningOnly; - private final boolean notset; - - RestrictedCodepointIterator(CodepointIterator internal, Filter filter, boolean scanningOnly) { - this(internal, filter, scanningOnly, false); - } - - RestrictedCodepointIterator(CodepointIterator internal, - Filter filter, - boolean scanningOnly, - boolean notset) { - super(internal); - this.filter = filter; - this.scanningOnly = scanningOnly; - this.notset = notset; - } - - @Override - public boolean hasNext() { - boolean b = super.hasNext(); - if (scanningOnly) { - try { - int cp = super.peek(super.position()).getValue(); - if (b && cp != -1 && check(cp)) { - return false; - } - } catch (InvalidCharacterException e) { - return false; - } - } - return b; - } - - @Override - public Codepoint next() { - Codepoint cp = super.next(); - int v = cp.getValue(); - if (v != -1 && check(v)) { - if (scanningOnly) { - super.position(super.position() - 1); - return null; - } else { - throw new InvalidCharacterException(v); - } - } - return cp; - } - - private boolean check(int cp) { - return notset == !filter.accept(cp); - } - - @Override - public char[] nextChars() { - char[] chars = super.nextChars(); - if (chars != null && chars.length > 0) { - if (chars.length == 1 && check(chars[0])) { - if (scanningOnly) { - super.position(super.position() - 1); - return null; - } else { - throw new InvalidCharacterException(chars[0]); - } - } else if (chars.length == 2) { - int cp = CharUtils.toSupplementary(chars[0], chars[1]).getValue(); - if (check(cp)) { - if (scanningOnly) { - super.position(super.position() - 2); - return null; - } else { - throw new InvalidCharacterException(cp); - } - } - } - } - return chars; - } - } } diff --git a/content-resource/src/main/java/org/xbib/content/resource/text/InvalidCharacterException.java b/content-resource/src/main/java/org/xbib/content/resource/text/InvalidCharacterException.java index da2f940..89a9d9a 100644 --- a/content-resource/src/main/java/org/xbib/content/resource/text/InvalidCharacterException.java +++ b/content-resource/src/main/java/org/xbib/content/resource/text/InvalidCharacterException.java @@ -1,11 +1,8 @@ package org.xbib.content.resource.text; -/** - * - */ +@SuppressWarnings("serial") public class InvalidCharacterException extends RuntimeException { - private static final long serialVersionUID = -3037013255350562940L; private final int input; public InvalidCharacterException(int input) { diff --git a/content-resource/src/main/java/org/xbib/content/resource/text/Profile.java b/content-resource/src/main/java/org/xbib/content/resource/text/Profile.java new file mode 100644 index 0000000..96cd6f9 --- /dev/null +++ b/content-resource/src/main/java/org/xbib/content/resource/text/Profile.java @@ -0,0 +1,54 @@ +package org.xbib.content.resource.text; + +/** + * + */ +public enum Profile { + NONE(codepoint -> true), + ALPHA(codepoint -> !CharUtils.isAlpha(codepoint)), + ALPHANUM(codepoint -> !CharUtils.isAlphaDigit(codepoint)), + FRAGMENT(codepoint -> !CharUtils.isFragment(codepoint)), + IFRAGMENT(codepoint -> !CharUtils.isIfragment(codepoint)), + PATH(codepoint -> !CharUtils.isPath(codepoint)), + IPATH(codepoint -> !CharUtils.isIpath(codepoint)), + IUSERINFO(codepoint -> !CharUtils.isIuserinfo(codepoint)), + USERINFO(codepoint -> !CharUtils.isUserInfo(codepoint)), + QUERY(codepoint -> !CharUtils.isQuery(codepoint)), + IQUERY(codepoint -> !CharUtils.isIquery(codepoint)), + SCHEME(codepoint -> !CharUtils.isScheme(codepoint)), + PATHNODELIMS(codepoint -> !CharUtils.isPathNoDelims(codepoint)), + IPATHNODELIMS(codepoint -> !CharUtils.isIpathnodelims(codepoint)), + IPATHNODELIMS_SEG(codepoint -> !CharUtils.isIpathnodelims(codepoint) && codepoint != '@' && codepoint != ':'), + IREGNAME(codepoint -> !CharUtils.isIregname(codepoint)), + IHOST(codepoint -> !CharUtils.isIhost(codepoint)), + IPRIVATE(codepoint -> !CharUtils.isIprivate(codepoint)), + RESERVED(codepoint -> !CharUtils.isReserved(codepoint)), + IUNRESERVED(codepoint -> !CharUtils.isIunreserved(codepoint)), + UNRESERVED(codepoint -> !CharUtils.isUnreserved(codepoint)), + SCHEMESPECIFICPART(codepoint -> !CharUtils.isIunreserved(codepoint) && !CharUtils.isReserved(codepoint) + && !CharUtils.isIprivate(codepoint) + && !CharUtils.isPctEnc(codepoint) + && codepoint != '#'), + AUTHORITY(codepoint -> !CharUtils.isRegname(codepoint) && !CharUtils.isUserInfo(codepoint) && !CharUtils.isGenDelim(codepoint)), + ASCIISANSCRLF(codepoint -> !CharUtils.inRange(codepoint, 1, 9) && !CharUtils.inRange(codepoint, 14, 127)), + PCT(codepoint -> !CharUtils.isPctEnc(codepoint)), + STD3ASCIIRULES(codepoint -> !CharUtils.inRange(codepoint, 0x0000, 0x002C) && + !CharUtils.inRange(codepoint, 0x002E, 0x002F) && + !CharUtils.inRange(codepoint, 0x003A, 0x0040) && + !CharUtils.inRange(codepoint, 0x005B, 0x0060) && + !CharUtils.inRange(codepoint, 0x007B, 0x007F)); + + private final CodepointFilter filter; + + Profile(CodepointFilter filter) { + this.filter = filter; + } + + public CodepointFilter filter() { + return filter; + } + + public boolean check(int codepoint) { + return filter.accept(codepoint); + } +} diff --git a/content-resource/src/main/java/org/xbib/content/resource/text/RestrictedCodepointIterator.java b/content-resource/src/main/java/org/xbib/content/resource/text/RestrictedCodepointIterator.java new file mode 100644 index 0000000..d0cb3ae --- /dev/null +++ b/content-resource/src/main/java/org/xbib/content/resource/text/RestrictedCodepointIterator.java @@ -0,0 +1,83 @@ +package org.xbib.content.resource.text; + +class RestrictedCodepointIterator extends DelegatingCodepointIterator { + + private final CodepointFilter filter; + private final boolean scanningOnly; + private final boolean notset; + + RestrictedCodepointIterator(CodepointIterator internal, CodepointFilter filter, boolean scanningOnly) { + this(internal, filter, scanningOnly, false); + } + + RestrictedCodepointIterator(CodepointIterator internal, + CodepointFilter filter, + boolean scanningOnly, + boolean notset) { + super(internal); + this.filter = filter; + this.scanningOnly = scanningOnly; + this.notset = notset; + } + + @Override + public boolean hasNext() { + boolean b = super.hasNext(); + if (scanningOnly) { + try { + int cp = super.peek(super.position()).getValue(); + if (b && cp != -1 && check(cp)) { + return false; + } + } catch (InvalidCharacterException e) { + return false; + } + } + return b; + } + + @Override + public Codepoint next() { + Codepoint cp = super.next(); + int v = cp.getValue(); + if (v != -1 && check(v)) { + if (scanningOnly) { + super.position(super.position() - 1); + return null; + } else { + throw new InvalidCharacterException(v); + } + } + return cp; + } + + private boolean check(int cp) { + return notset == !filter.accept(cp); + } + + @Override + public char[] nextChars() { + char[] chars = super.nextChars(); + if (chars != null && chars.length > 0) { + if (chars.length == 1 && check(chars[0])) { + if (scanningOnly) { + super.position(super.position() - 1); + return null; + } else { + throw new InvalidCharacterException(chars[0]); + } + } else if (chars.length == 2) { + int cp = CharUtils.toSupplementary(chars[0], chars[1]).getValue(); + if (check(cp)) { + if (scanningOnly) { + super.position(super.position() - 2); + return null; + } else { + throw new InvalidCharacterException(cp); + } + } + } + } + return chars; + } +} diff --git a/content-resource/src/test/java/org/xbib/content/resource/TestIRI.java b/content-resource/src/test/java/org/xbib/content/resource/TestIRI.java new file mode 100644 index 0000000..fe86bb1 --- /dev/null +++ b/content-resource/src/test/java/org/xbib/content/resource/TestIRI.java @@ -0,0 +1,18 @@ +package org.xbib.content.resource; + +import java.net.URISyntaxException; +import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestIRI { + + @Test + public void testSimple() throws URISyntaxException { + IRI iri = IRI.create("http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org"); + assertEquals("http", iri.getScheme()); + assertEquals("validator.w3.org", iri.getHost()); + assertEquals("/check", iri.getPath()); + assertEquals("//validator.w3.org/check?uri=http%3A%2F%2Frésumé.example.org", iri.getSchemeSpecificPart()); + //assertEquals("http://validator.w3.org/check?uri=http%3A%2F%2Fr%C3%A9sum%C3%A9.example.org", iri.toURI().toString()); + } +}