cleaning up IRI implementation

This commit is contained in:
Jörg Prante 2022-10-22 13:00:27 +02:00
parent 3fa5cb456c
commit 7d09b26148
11 changed files with 225 additions and 206 deletions

View file

@ -1,7 +1,7 @@
package org.xbib.content.resource;
import org.xbib.content.resource.text.CharUtils;
import org.xbib.content.resource.text.CharUtils.Profile;
import org.xbib.content.resource.text.Profile;
import org.xbib.content.resource.text.InvalidCharacterException;
import org.xbib.net.PercentDecoder;
import org.xbib.net.PercentEncoders;

View file

@ -3,10 +3,9 @@ package org.xbib.content.resource;
/**
*
*/
@SuppressWarnings("serial")
public class IRISyntaxException extends RuntimeException {
private static final long serialVersionUID = 1813084470937980392L;
IRISyntaxException(String message) {
super(message);
}

View file

@ -0,0 +1,28 @@
package org.xbib.content.resource.text;
/**
 * {@link CodepointIterator} whose characters come from a {@code char[]}.
 *
 * Reads are driven by the inherited {@code position} and {@code limit} fields:
 * {@link #get()} serves {@code buffer[position]} while {@code position < limit},
 * and {@link #get(int)} only serves indices in {@code [0, limit)}.
 */
class CharArrayCodepointIterator extends CodepointIterator {

    /** The characters being iterated; the array is stored as passed in, not copied. */
    protected char[] buffer;

    /**
     * Creates an iterator over the whole array.
     *
     * @param buffer the characters to iterate
     */
    CharArrayCodepointIterator(char[] buffer) {
        this(buffer, 0, buffer.length);
    }

    /**
     * Creates an iterator over part of the array.
     *
     * @param buffer the characters to iterate
     * @param n      initial value of {@code position}
     * @param e      upper bound for {@code limit}; the stored limit is the smaller of
     *               {@code e} and {@code buffer.length - n}
     */
    CharArrayCodepointIterator(char[] buffer, int n, int e) {
        this.buffer = buffer;
        this.position = n;
        // NOTE(review): get() compares limit directly against position, so for n > 0
        // capping limit at (buffer.length - n) shortens the readable range - confirm
        // whether e is meant as an end index or as a count.
        final int charsFromStart = buffer.length - n;
        this.limit = charsFromStart < e ? charsFromStart : e;
    }

    /**
     * Returns the char at the current position and advances by one.
     *
     * @return the next char, or {@code (char) -1} once {@code position} has reached {@code limit}
     */
    @Override
    protected char get() {
        if (position >= limit) {
            return (char) -1;
        }
        // Advance first, then read, exactly as buffer[position++] would.
        final int at = position++;
        return buffer[at];
    }

    /**
     * Returns the char at an absolute index without moving the position.
     *
     * @param index index into the backing array
     * @return the char stored at {@code index}
     * @throws ArrayIndexOutOfBoundsException if {@code index} is negative or not below {@code limit}
     */
    @Override
    protected char get(int index) {
        final boolean readable = index >= 0 && index < limit;
        if (!readable) {
            throw new ArrayIndexOutOfBoundsException(index);
        }
        return buffer[index];
    }
}

View file

@ -0,0 +1,25 @@
package org.xbib.content.resource.text;
/**
 * {@link CodepointIterator} whose characters come from a {@link CharSequence}.
 *
 * Neither accessor consults {@code limit}; an out-of-range read surfaces whatever
 * {@link CharSequence#charAt(int)} throws for the backing sequence.
 */
class CharSequenceCodepointIterator extends CodepointIterator {

    /** The sequence being iterated. */
    private final CharSequence buffer;

    /**
     * Creates an iterator over the whole sequence.
     *
     * @param buffer the characters to iterate
     */
    CharSequenceCodepointIterator(CharSequence buffer) {
        this(buffer, 0, buffer.length());
    }

    /**
     * Creates an iterator over part of the sequence.
     *
     * @param buffer the characters to iterate
     * @param n      initial value of {@code position}
     * @param e      upper bound for {@code limit}; the stored limit is the smaller of
     *               {@code e} and {@code buffer.length() - n}
     */
    CharSequenceCodepointIterator(CharSequence buffer, int n, int e) {
        this.buffer = buffer;
        this.position = n;
        // NOTE(review): for n > 0 this caps limit at (length - n) although position
        // starts at n - confirm whether e is meant as an end index or as a count.
        final int charsFromStart = buffer.length() - n;
        this.limit = charsFromStart < e ? charsFromStart : e;
    }

    /**
     * Returns the char at the current position and advances by one.
     *
     * @return the char at the position held before the call
     */
    @Override
    protected char get() {
        // Advance first, then read, exactly as charAt(position++) would.
        final int at = position++;
        return buffer.charAt(at);
    }

    /**
     * Returns the char at an absolute index without moving the position.
     *
     * @param index index into the backing sequence
     * @return the char stored at {@code index}
     */
    @Override
    protected char get(int index) {
        return buffer.charAt(index);
    }
}

View file

@ -6,12 +6,12 @@ package org.xbib.content.resource.text;
public final class CharUtils {
public static final char LRE = 0x202A;
private static final char RLE = 0x202B;
private static final char LRO = 0x202D;
private static final char RLO = 0x202E;
private static final char LRM = 0x200E;
private static final char RLM = 0x200F;
private static final char PDF = 0x202C;
public static final char RLE = 0x202B;
public static final char LRO = 0x202D;
public static final char RLO = 0x202E;
public static final char LRM = 0x200E;
public static final char RLM = 0x200F;
public static final char PDF = 0x202C;
private CharUtils() {
}
@ -594,56 +594,4 @@ public final class CharUtils {
verify(CodepointIterator.forCharSequence(s), profile);
}
/**
 * Named codepoint filters, one per character class.
 *
 * With the exception of {@link #NONE}, whose filter returns {@code true} for every
 * codepoint, each constant's filter returns {@code true} for codepoints that are
 * NOT members of the character class the constant is named after.
 */
public enum Profile {
    NONE(cp -> true),
    ALPHA(cp -> !isAlpha(cp)),
    ALPHANUM(cp -> !isAlphaDigit(cp)),
    FRAGMENT(cp -> !isFragment(cp)),
    IFRAGMENT(cp -> !isIfragment(cp)),
    PATH(cp -> !isPath(cp)),
    IPATH(cp -> !isIpath(cp)),
    IUSERINFO(cp -> !isIuserinfo(cp)),
    USERINFO(cp -> !isUserInfo(cp)),
    QUERY(cp -> !isQuery(cp)),
    IQUERY(cp -> !isIquery(cp)),
    SCHEME(cp -> !isScheme(cp)),
    PATHNODELIMS(cp -> !isPathNoDelims(cp)),
    IPATHNODELIMS(cp -> !isIpathnodelims(cp)),
    IPATHNODELIMS_SEG(cp -> !(isIpathnodelims(cp) || cp == '@' || cp == ':')),
    IREGNAME(cp -> !isIregname(cp)),
    IHOST(cp -> !isIhost(cp)),
    IPRIVATE(cp -> !isIprivate(cp)),
    RESERVED(cp -> !isReserved(cp)),
    IUNRESERVED(cp -> !isIunreserved(cp)),
    UNRESERVED(cp -> !isUnreserved(cp)),
    SCHEMESPECIFICPART(cp -> !(isIunreserved(cp)
            || isReserved(cp)
            || isIprivate(cp)
            || isPctEnc(cp)
            || cp == '#')),
    AUTHORITY(cp -> !(isRegname(cp) || isUserInfo(cp) || isGenDelim(cp))),
    ASCIISANSCRLF(cp -> !(inRange(cp, 1, 9) || inRange(cp, 14, 127))),
    PCT(cp -> !isPctEnc(cp)),
    STD3ASCIIRULES(cp -> !(inRange(cp, 0x0000, 0x002C)
            || inRange(cp, 0x002E, 0x002F)
            || inRange(cp, 0x003A, 0x0040)
            || inRange(cp, 0x005B, 0x0060)
            || inRange(cp, 0x007B, 0x007F)));

    /** The filter bound to this constant. */
    private final Filter profileFilter;

    Profile(Filter profileFilter) {
        this.profileFilter = profileFilter;
    }

    /**
     * @return the filter bound to this constant
     */
    public Filter filter() {
        return profileFilter;
    }

    /**
     * Applies this constant's filter to a single codepoint.
     *
     * @param codepoint the codepoint to test
     * @return whatever the filter's {@code accept} returns for {@code codepoint}
     */
    public boolean check(int codepoint) {
        return profileFilter.accept(codepoint);
    }
}
}

View file

@ -4,7 +4,7 @@ package org.xbib.content.resource.text;
* Filters are used in a variety of ways to filter or verify unicode codepoints.
*/
@FunctionalInterface
public interface Filter {
public interface CodepointFilter {
boolean accept(int c);
boolean accept(int ch);
}

View file

@ -33,28 +33,27 @@ public abstract class CodepointIterator implements Iterator<Codepoint> {
return new CharSequenceCodepointIterator(seq);
}
public static CodepointIterator restrict(CodepointIterator ci, Filter filter) {
public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter) {
return new RestrictedCodepointIterator(ci, filter, false);
}
public static CodepointIterator restrict(CodepointIterator ci, Filter filter, boolean scanning) {
public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter, boolean scanning) {
return new RestrictedCodepointIterator(ci, filter, scanning);
}
public static CodepointIterator restrict(CodepointIterator ci, Filter filter, boolean scanning, boolean invert) {
public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter, boolean scanning, boolean invert) {
return new RestrictedCodepointIterator(ci, filter, scanning, invert);
}
public CodepointIterator restrict(Filter filter) {
public CodepointIterator restrict(CodepointFilter filter) {
return restrict(this, filter);
}
public CodepointIterator restrict(Filter filter, boolean scanning) {
public CodepointIterator restrict(CodepointFilter filter, boolean scanning) {
return restrict(this, filter, scanning);
}
public CodepointIterator restrict(Filter filter, boolean scanning, boolean invert) {
public CodepointIterator restrict(CodepointFilter filter, boolean scanning, boolean invert) {
return restrict(this, filter, scanning, invert);
}
@ -266,136 +265,4 @@ public abstract class CodepointIterator implements Iterator<Codepoint> {
throw new UnsupportedOperationException();
}
/**
 * {@link CodepointIterator} whose characters come from a {@code char[]}.
 *
 * Reads are driven by the inherited {@code position} and {@code limit} fields:
 * {@link #get()} serves {@code buffer[position]} while {@code position < limit},
 * and {@link #get(int)} only serves indices in {@code [0, limit)}.
 */
private static class CharArrayCodepointIterator extends CodepointIterator {

    /** The characters being iterated; the array is stored as passed in, not copied. */
    protected char[] buffer;

    /**
     * Creates an iterator over the whole array.
     *
     * @param buffer the characters to iterate
     */
    CharArrayCodepointIterator(char[] buffer) {
        this(buffer, 0, buffer.length);
    }

    /**
     * Creates an iterator over part of the array.
     *
     * @param buffer the characters to iterate
     * @param n      initial value of {@code position}
     * @param e      upper bound for {@code limit}; the stored limit is the smaller of
     *               {@code e} and {@code buffer.length - n}
     */
    CharArrayCodepointIterator(char[] buffer, int n, int e) {
        this.buffer = buffer;
        this.position = n;
        // NOTE(review): get() compares limit directly against position, so for n > 0
        // capping limit at (buffer.length - n) shortens the readable range - confirm
        // whether e is meant as an end index or as a count.
        final int charsFromStart = buffer.length - n;
        this.limit = charsFromStart < e ? charsFromStart : e;
    }

    /**
     * Returns the char at the current position and advances by one.
     *
     * @return the next char, or {@code (char) -1} once {@code position} has reached {@code limit}
     */
    @Override
    protected char get() {
        if (position >= limit) {
            return (char) -1;
        }
        // Advance first, then read, exactly as buffer[position++] would.
        final int at = position++;
        return buffer[at];
    }

    /**
     * Returns the char at an absolute index without moving the position.
     *
     * @param index index into the backing array
     * @return the char stored at {@code index}
     * @throws ArrayIndexOutOfBoundsException if {@code index} is negative or not below {@code limit}
     */
    @Override
    protected char get(int index) {
        final boolean readable = index >= 0 && index < limit;
        if (!readable) {
            throw new ArrayIndexOutOfBoundsException(index);
        }
        return buffer[index];
    }
}
/**
 * {@link CodepointIterator} whose characters come from a {@link CharSequence}.
 *
 * Neither accessor consults {@code limit}; an out-of-range read surfaces whatever
 * {@link CharSequence#charAt(int)} throws for the backing sequence.
 */
private static class CharSequenceCodepointIterator extends CodepointIterator {

    /** The sequence being iterated. */
    private CharSequence buffer;

    /**
     * Creates an iterator over the whole sequence.
     *
     * @param buffer the characters to iterate
     */
    CharSequenceCodepointIterator(CharSequence buffer) {
        this(buffer, 0, buffer.length());
    }

    /**
     * Creates an iterator over part of the sequence.
     *
     * @param buffer the characters to iterate
     * @param n      initial value of {@code position}
     * @param e      upper bound for {@code limit}; the stored limit is the smaller of
     *               {@code e} and {@code buffer.length() - n}
     */
    CharSequenceCodepointIterator(CharSequence buffer, int n, int e) {
        this.buffer = buffer;
        this.position = n;
        // NOTE(review): for n > 0 this caps limit at (length - n) although position
        // starts at n - confirm whether e is meant as an end index or as a count.
        final int charsFromStart = buffer.length() - n;
        this.limit = charsFromStart < e ? charsFromStart : e;
    }

    /**
     * Returns the char at the current position and advances by one.
     *
     * @return the char at the position held before the call
     */
    @Override
    protected char get() {
        // Advance first, then read, exactly as charAt(position++) would.
        final int at = position++;
        return buffer.charAt(at);
    }

    /**
     * Returns the char at an absolute index without moving the position.
     *
     * @param index index into the backing sequence
     * @return the char stored at {@code index}
     */
    @Override
    protected char get(int index) {
        return buffer.charAt(index);
    }
}
/**
 * Wraps another {@link CodepointIterator} and tests every codepoint it yields against a
 * {@link Filter}.
 *
 * A codepoint is rejected when {@code filter.accept(cp)} is {@code true}; with
 * {@code notset} enabled the test is inverted and a codepoint is rejected when
 * {@code accept} is {@code false}. What rejection means depends on {@code scanningOnly}:
 * when it is {@code false} an {@link InvalidCharacterException} carrying the codepoint is
 * thrown; when it is {@code true} the position is moved back to the start of the rejected
 * codepoint, {@code null} is returned, and {@link #hasNext()} reports {@code false} there.
 */
private static class RestrictedCodepointIterator extends DelegatingCodepointIterator {

    private final Filter filter;
    private final boolean scanningOnly;
    private final boolean notset;

    /**
     * Creates a restricted iterator with the non-inverted filter test.
     *
     * @param internal     the iterator to read from
     * @param filter       the filter each codepoint is tested against
     * @param scanningOnly {@code true} to stop quietly at a rejected codepoint instead of throwing
     */
    RestrictedCodepointIterator(CodepointIterator internal, Filter filter, boolean scanningOnly) {
        this(internal, filter, scanningOnly, false);
    }

    /**
     * Creates a restricted iterator.
     *
     * @param internal     the iterator to read from
     * @param filter       the filter each codepoint is tested against
     * @param scanningOnly {@code true} to stop quietly at a rejected codepoint instead of throwing
     * @param notset       {@code true} to invert the filter test
     */
    RestrictedCodepointIterator(CodepointIterator internal,
                                Filter filter,
                                boolean scanningOnly,
                                boolean notset) {
        super(internal);
        this.filter = filter;
        this.scanningOnly = scanningOnly;
        this.notset = notset;
    }

    /**
     * Reports whether another codepoint can be read. In scanning mode this is also
     * {@code false} when the upcoming codepoint would be rejected or cannot be decoded.
     *
     * @return {@code true} if a call to {@link #next()} would yield a codepoint
     */
    @Override
    public boolean hasNext() {
        boolean b = super.hasNext();
        // Only look ahead when the delegate actually has something: the answer is false
        // either way, and this avoids dereferencing a peek result at the end of input.
        if (b && scanningOnly) {
            try {
                int cp = super.peek(super.position()).getValue();
                if (cp != -1 && check(cp)) {
                    return false;
                }
            } catch (InvalidCharacterException e) {
                return false;
            }
        }
        return b;
    }

    /**
     * Reads the next codepoint and applies the filter to it.
     *
     * @return the codepoint, or {@code null} if the delegate returned {@code null} or, in
     *         scanning mode, the codepoint was rejected
     * @throws InvalidCharacterException if the codepoint is rejected and scanning mode is off
     */
    @Override
    public Codepoint next() {
        Codepoint cp = super.next();
        if (cp == null) {
            // Same null tolerance nextChars() applies to its delegate result.
            return null;
        }
        int v = cp.getValue();
        if (v != -1 && check(v)) {
            // Un-read as many chars as the codepoint occupies; nextChars() likewise
            // rewinds by 2 when it rejects a surrogate pair.
            reject(v, Character.charCount(v));
            return null;
        }
        return cp;
    }

    /**
     * Reads the chars of the next codepoint (one char, or two for a surrogate pair) and
     * applies the filter to the codepoint they form.
     *
     * @return the chars as returned by the delegate, or {@code null} if, in scanning mode,
     *         the codepoint was rejected
     * @throws InvalidCharacterException if the codepoint is rejected and scanning mode is off
     */
    @Override
    public char[] nextChars() {
        char[] chars = super.nextChars();
        if (chars == null) {
            return null;
        }
        if (chars.length == 1) {
            if (check(chars[0])) {
                reject(chars[0], 1);
                return null;
            }
        } else if (chars.length == 2) {
            int cp = CharUtils.toSupplementary(chars[0], chars[1]).getValue();
            if (check(cp)) {
                reject(cp, 2);
                return null;
            }
        }
        return chars;
    }

    /** Returns {@code true} when the codepoint is rejected under the current {@code notset} setting. */
    private boolean check(int cp) {
        return notset == !filter.accept(cp);
    }

    /**
     * Handles a rejected codepoint: throws outside scanning mode, otherwise moves the
     * position back by {@code width} chars so the codepoint is left unread.
     */
    private void reject(int cp, int width) {
        if (!scanningOnly) {
            throw new InvalidCharacterException(cp);
        }
        super.position(super.position() - width);
    }
}
}

View file

@ -1,11 +1,8 @@
package org.xbib.content.resource.text;
/**
*
*/
@SuppressWarnings("serial")
public class InvalidCharacterException extends RuntimeException {
private static final long serialVersionUID = -3037013255350562940L;
private final int input;
public InvalidCharacterException(int input) {

View file

@ -0,0 +1,54 @@
package org.xbib.content.resource.text;
/**
 * Named codepoint filters, one per character class.
 *
 * With the exception of {@link #NONE}, whose filter returns {@code true} for every
 * codepoint, each constant's filter returns {@code true} for codepoints that are
 * NOT members of the character class the constant is named after.
 */
public enum Profile {
    NONE(cp -> true),
    ALPHA(cp -> !CharUtils.isAlpha(cp)),
    ALPHANUM(cp -> !CharUtils.isAlphaDigit(cp)),
    FRAGMENT(cp -> !CharUtils.isFragment(cp)),
    IFRAGMENT(cp -> !CharUtils.isIfragment(cp)),
    PATH(cp -> !CharUtils.isPath(cp)),
    IPATH(cp -> !CharUtils.isIpath(cp)),
    IUSERINFO(cp -> !CharUtils.isIuserinfo(cp)),
    USERINFO(cp -> !CharUtils.isUserInfo(cp)),
    QUERY(cp -> !CharUtils.isQuery(cp)),
    IQUERY(cp -> !CharUtils.isIquery(cp)),
    SCHEME(cp -> !CharUtils.isScheme(cp)),
    PATHNODELIMS(cp -> !CharUtils.isPathNoDelims(cp)),
    IPATHNODELIMS(cp -> !CharUtils.isIpathnodelims(cp)),
    IPATHNODELIMS_SEG(cp -> !(CharUtils.isIpathnodelims(cp) || cp == '@' || cp == ':')),
    IREGNAME(cp -> !CharUtils.isIregname(cp)),
    IHOST(cp -> !CharUtils.isIhost(cp)),
    IPRIVATE(cp -> !CharUtils.isIprivate(cp)),
    RESERVED(cp -> !CharUtils.isReserved(cp)),
    IUNRESERVED(cp -> !CharUtils.isIunreserved(cp)),
    UNRESERVED(cp -> !CharUtils.isUnreserved(cp)),
    SCHEMESPECIFICPART(cp -> !(CharUtils.isIunreserved(cp)
            || CharUtils.isReserved(cp)
            || CharUtils.isIprivate(cp)
            || CharUtils.isPctEnc(cp)
            || cp == '#')),
    AUTHORITY(cp -> !(CharUtils.isRegname(cp) || CharUtils.isUserInfo(cp) || CharUtils.isGenDelim(cp))),
    ASCIISANSCRLF(cp -> !(CharUtils.inRange(cp, 1, 9) || CharUtils.inRange(cp, 14, 127))),
    PCT(cp -> !CharUtils.isPctEnc(cp)),
    STD3ASCIIRULES(cp -> !(CharUtils.inRange(cp, 0x0000, 0x002C)
            || CharUtils.inRange(cp, 0x002E, 0x002F)
            || CharUtils.inRange(cp, 0x003A, 0x0040)
            || CharUtils.inRange(cp, 0x005B, 0x0060)
            || CharUtils.inRange(cp, 0x007B, 0x007F)));

    /** The filter bound to this constant. */
    private final CodepointFilter profileFilter;

    Profile(CodepointFilter profileFilter) {
        this.profileFilter = profileFilter;
    }

    /**
     * @return the filter bound to this constant
     */
    public CodepointFilter filter() {
        return profileFilter;
    }

    /**
     * Applies this constant's filter to a single codepoint.
     *
     * @param codepoint the codepoint to test
     * @return whatever the filter's {@code accept} returns for {@code codepoint}
     */
    public boolean check(int codepoint) {
        return profileFilter.accept(codepoint);
    }
}

View file

@ -0,0 +1,83 @@
package org.xbib.content.resource.text;
/**
 * Wraps another {@link CodepointIterator} and tests every codepoint it yields against a
 * {@link CodepointFilter}.
 *
 * A codepoint is rejected when {@code filter.accept(cp)} is {@code true}; with
 * {@code notset} enabled the test is inverted and a codepoint is rejected when
 * {@code accept} is {@code false}. What rejection means depends on {@code scanningOnly}:
 * when it is {@code false} an {@link InvalidCharacterException} carrying the codepoint is
 * thrown; when it is {@code true} the position is moved back to the start of the rejected
 * codepoint, {@code null} is returned, and {@link #hasNext()} reports {@code false} there.
 */
class RestrictedCodepointIterator extends DelegatingCodepointIterator {

    private final CodepointFilter filter;
    private final boolean scanningOnly;
    private final boolean notset;

    /**
     * Creates a restricted iterator with the non-inverted filter test.
     *
     * @param internal     the iterator to read from
     * @param filter       the filter each codepoint is tested against
     * @param scanningOnly {@code true} to stop quietly at a rejected codepoint instead of throwing
     */
    RestrictedCodepointIterator(CodepointIterator internal, CodepointFilter filter, boolean scanningOnly) {
        this(internal, filter, scanningOnly, false);
    }

    /**
     * Creates a restricted iterator.
     *
     * @param internal     the iterator to read from
     * @param filter       the filter each codepoint is tested against
     * @param scanningOnly {@code true} to stop quietly at a rejected codepoint instead of throwing
     * @param notset       {@code true} to invert the filter test
     */
    RestrictedCodepointIterator(CodepointIterator internal,
                                CodepointFilter filter,
                                boolean scanningOnly,
                                boolean notset) {
        super(internal);
        this.filter = filter;
        this.scanningOnly = scanningOnly;
        this.notset = notset;
    }

    /**
     * Reports whether another codepoint can be read. In scanning mode this is also
     * {@code false} when the upcoming codepoint would be rejected or cannot be decoded.
     *
     * @return {@code true} if a call to {@link #next()} would yield a codepoint
     */
    @Override
    public boolean hasNext() {
        boolean b = super.hasNext();
        // Only look ahead when the delegate actually has something: the answer is false
        // either way, and this avoids dereferencing a peek result at the end of input.
        if (b && scanningOnly) {
            try {
                int cp = super.peek(super.position()).getValue();
                if (cp != -1 && check(cp)) {
                    return false;
                }
            } catch (InvalidCharacterException e) {
                return false;
            }
        }
        return b;
    }

    /**
     * Reads the next codepoint and applies the filter to it.
     *
     * @return the codepoint, or {@code null} if the delegate returned {@code null} or, in
     *         scanning mode, the codepoint was rejected
     * @throws InvalidCharacterException if the codepoint is rejected and scanning mode is off
     */
    @Override
    public Codepoint next() {
        Codepoint cp = super.next();
        if (cp == null) {
            // Same null tolerance nextChars() applies to its delegate result.
            return null;
        }
        int v = cp.getValue();
        if (v != -1 && check(v)) {
            // Un-read as many chars as the codepoint occupies; nextChars() likewise
            // rewinds by 2 when it rejects a surrogate pair.
            reject(v, Character.charCount(v));
            return null;
        }
        return cp;
    }

    /**
     * Reads the chars of the next codepoint (one char, or two for a surrogate pair) and
     * applies the filter to the codepoint they form.
     *
     * @return the chars as returned by the delegate, or {@code null} if, in scanning mode,
     *         the codepoint was rejected
     * @throws InvalidCharacterException if the codepoint is rejected and scanning mode is off
     */
    @Override
    public char[] nextChars() {
        char[] chars = super.nextChars();
        if (chars == null) {
            return null;
        }
        if (chars.length == 1) {
            if (check(chars[0])) {
                reject(chars[0], 1);
                return null;
            }
        } else if (chars.length == 2) {
            int cp = CharUtils.toSupplementary(chars[0], chars[1]).getValue();
            if (check(cp)) {
                reject(cp, 2);
                return null;
            }
        }
        return chars;
    }

    /** Returns {@code true} when the codepoint is rejected under the current {@code notset} setting. */
    private boolean check(int cp) {
        return notset == !filter.accept(cp);
    }

    /**
     * Handles a rejected codepoint: throws outside scanning mode, otherwise moves the
     * position back by {@code width} chars so the codepoint is left unread.
     */
    private void reject(int cp, int width) {
        if (!scanningOnly) {
            throw new InvalidCharacterException(cp);
        }
        super.position(super.position() - width);
    }
}

View file

@ -0,0 +1,18 @@
package org.xbib.content.resource;
import java.net.URISyntaxException;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Basic parsing test for {@link IRI}.
 */
public class TestIRI {

    /**
     * Parses an HTTP IRI whose query value contains percent-escapes and non-ASCII letters,
     * then checks the scheme, host, path and scheme-specific part reported for it.
     */
    @Test
    public void testSimple() throws URISyntaxException {
        final String input = "http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org";
        final IRI parsed = IRI.create(input);

        assertEquals("http", parsed.getScheme());
        assertEquals("validator.w3.org", parsed.getHost());
        assertEquals("/check", parsed.getPath());
        assertEquals("//validator.w3.org/check?uri=http%3A%2F%2Frésumé.example.org", parsed.getSchemeSpecificPart());
        // Disabled check of the URI form (non-ASCII letters percent-encoded); why it is
        // switched off is not visible here.
        //assertEquals("http://validator.w3.org/check?uri=http%3A%2F%2Fr%C3%A9sum%C3%A9.example.org", parsed.toURI().toString());
    }
}