cleaning up IRI implementation
This commit is contained in:
parent
3fa5cb456c
commit
7d09b26148
11 changed files with 225 additions and 206 deletions
|
@ -1,7 +1,7 @@
|
|||
package org.xbib.content.resource;
|
||||
|
||||
import org.xbib.content.resource.text.CharUtils;
|
||||
import org.xbib.content.resource.text.CharUtils.Profile;
|
||||
import org.xbib.content.resource.text.Profile;
|
||||
import org.xbib.content.resource.text.InvalidCharacterException;
|
||||
import org.xbib.net.PercentDecoder;
|
||||
import org.xbib.net.PercentEncoders;
|
||||
|
|
|
@ -3,10 +3,9 @@ package org.xbib.content.resource;
|
|||
/**
|
||||
*
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class IRISyntaxException extends RuntimeException {
|
||||
|
||||
private static final long serialVersionUID = 1813084470937980392L;
|
||||
|
||||
IRISyntaxException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
package org.xbib.content.resource.text;
|
||||
|
||||
class CharArrayCodepointIterator extends CodepointIterator {
|
||||
protected char[] buffer;
|
||||
|
||||
CharArrayCodepointIterator(char[] buffer) {
|
||||
this(buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
CharArrayCodepointIterator(char[] buffer, int n, int e) {
|
||||
this.buffer = buffer;
|
||||
this.position = n;
|
||||
this.limit = Math.min(buffer.length - n, e);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char get() {
|
||||
return (position < limit) ? buffer[position++] : (char) -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char get(int index) {
|
||||
if (index < 0 || index >= limit) {
|
||||
throw new ArrayIndexOutOfBoundsException(index);
|
||||
}
|
||||
return buffer[index];
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package org.xbib.content.resource.text;
|
||||
|
||||
class CharSequenceCodepointIterator extends CodepointIterator {
|
||||
private final CharSequence buffer;
|
||||
|
||||
CharSequenceCodepointIterator(CharSequence buffer) {
|
||||
this(buffer, 0, buffer.length());
|
||||
}
|
||||
|
||||
CharSequenceCodepointIterator(CharSequence buffer, int n, int e) {
|
||||
this.buffer = buffer;
|
||||
this.position = n;
|
||||
this.limit = Math.min(buffer.length() - n, e);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char get() {
|
||||
return buffer.charAt(position++);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char get(int index) {
|
||||
return buffer.charAt(index);
|
||||
}
|
||||
}
|
|
@ -6,12 +6,12 @@ package org.xbib.content.resource.text;
|
|||
public final class CharUtils {
|
||||
|
||||
public static final char LRE = 0x202A;
|
||||
private static final char RLE = 0x202B;
|
||||
private static final char LRO = 0x202D;
|
||||
private static final char RLO = 0x202E;
|
||||
private static final char LRM = 0x200E;
|
||||
private static final char RLM = 0x200F;
|
||||
private static final char PDF = 0x202C;
|
||||
public static final char RLE = 0x202B;
|
||||
public static final char LRO = 0x202D;
|
||||
public static final char RLO = 0x202E;
|
||||
public static final char LRM = 0x200E;
|
||||
public static final char RLM = 0x200F;
|
||||
public static final char PDF = 0x202C;
|
||||
|
||||
private CharUtils() {
|
||||
}
|
||||
|
@ -594,56 +594,4 @@ public final class CharUtils {
|
|||
verify(CodepointIterator.forCharSequence(s), profile);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public enum Profile {
|
||||
NONE(codepoint -> true),
|
||||
ALPHA(codepoint -> !isAlpha(codepoint)),
|
||||
ALPHANUM(codepoint -> !isAlphaDigit(codepoint)),
|
||||
FRAGMENT(codepoint -> !isFragment(codepoint)),
|
||||
IFRAGMENT(codepoint -> !isIfragment(codepoint)),
|
||||
PATH(codepoint -> !isPath(codepoint)),
|
||||
IPATH(codepoint -> !isIpath(codepoint)),
|
||||
IUSERINFO(codepoint -> !isIuserinfo(codepoint)),
|
||||
USERINFO(codepoint -> !isUserInfo(codepoint)),
|
||||
QUERY(codepoint -> !isQuery(codepoint)),
|
||||
IQUERY(codepoint -> !isIquery(codepoint)),
|
||||
SCHEME(codepoint -> !isScheme(codepoint)),
|
||||
PATHNODELIMS(codepoint -> !isPathNoDelims(codepoint)),
|
||||
IPATHNODELIMS(codepoint -> !isIpathnodelims(codepoint)),
|
||||
IPATHNODELIMS_SEG(codepoint -> !isIpathnodelims(codepoint) && codepoint != '@' && codepoint != ':'),
|
||||
IREGNAME(codepoint -> !isIregname(codepoint)),
|
||||
IHOST(codepoint -> !isIhost(codepoint)),
|
||||
IPRIVATE(codepoint -> !isIprivate(codepoint)),
|
||||
RESERVED(codepoint -> !isReserved(codepoint)),
|
||||
IUNRESERVED(codepoint -> !isIunreserved(codepoint)),
|
||||
UNRESERVED(codepoint -> !isUnreserved(codepoint)),
|
||||
SCHEMESPECIFICPART(codepoint -> !isIunreserved(codepoint) && !isReserved(codepoint)
|
||||
&& !isIprivate(codepoint)
|
||||
&& !isPctEnc(codepoint)
|
||||
&& codepoint != '#'),
|
||||
AUTHORITY(codepoint -> !isRegname(codepoint) && !isUserInfo(codepoint) && !isGenDelim(codepoint)),
|
||||
ASCIISANSCRLF(codepoint -> !inRange(codepoint, 1, 9) && !inRange(codepoint, 14, 127)),
|
||||
PCT(codepoint -> !isPctEnc(codepoint)),
|
||||
STD3ASCIIRULES(codepoint -> !inRange(codepoint, 0x0000, 0x002C) &&
|
||||
!inRange(codepoint, 0x002E, 0x002F) &&
|
||||
!inRange(codepoint, 0x003A, 0x0040) &&
|
||||
!inRange(codepoint, 0x005B, 0x0060) &&
|
||||
!inRange(codepoint, 0x007B, 0x007F));
|
||||
private final Filter filter;
|
||||
|
||||
Profile(Filter filter) {
|
||||
this.filter = filter;
|
||||
}
|
||||
|
||||
public Filter filter() {
|
||||
return filter;
|
||||
}
|
||||
|
||||
public boolean check(int codepoint) {
|
||||
return filter.accept(codepoint);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ package org.xbib.content.resource.text;
|
|||
* Filters are used in a variety of ways to filter or verify unicode codepoints.
|
||||
*/
|
||||
@FunctionalInterface
public interface CodepointFilter {

    /**
     * Tests a single Unicode codepoint.
     *
     * @param ch the codepoint to test
     * @return the filter's verdict for {@code ch}
     */
    boolean accept(int ch);
}
|
|
@ -33,28 +33,27 @@ public abstract class CodepointIterator implements Iterator<Codepoint> {
|
|||
return new CharSequenceCodepointIterator(seq);
|
||||
}
|
||||
|
||||
|
||||
public static CodepointIterator restrict(CodepointIterator ci, Filter filter) {
|
||||
public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter) {
|
||||
return new RestrictedCodepointIterator(ci, filter, false);
|
||||
}
|
||||
|
||||
public static CodepointIterator restrict(CodepointIterator ci, Filter filter, boolean scanning) {
|
||||
public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter, boolean scanning) {
|
||||
return new RestrictedCodepointIterator(ci, filter, scanning);
|
||||
}
|
||||
|
||||
public static CodepointIterator restrict(CodepointIterator ci, Filter filter, boolean scanning, boolean invert) {
|
||||
public static CodepointIterator restrict(CodepointIterator ci, CodepointFilter filter, boolean scanning, boolean invert) {
|
||||
return new RestrictedCodepointIterator(ci, filter, scanning, invert);
|
||||
}
|
||||
|
||||
public CodepointIterator restrict(Filter filter) {
|
||||
public CodepointIterator restrict(CodepointFilter filter) {
|
||||
return restrict(this, filter);
|
||||
}
|
||||
|
||||
public CodepointIterator restrict(Filter filter, boolean scanning) {
|
||||
public CodepointIterator restrict(CodepointFilter filter, boolean scanning) {
|
||||
return restrict(this, filter, scanning);
|
||||
}
|
||||
|
||||
public CodepointIterator restrict(Filter filter, boolean scanning, boolean invert) {
|
||||
public CodepointIterator restrict(CodepointFilter filter, boolean scanning, boolean invert) {
|
||||
return restrict(this, filter, scanning, invert);
|
||||
}
|
||||
|
||||
|
@ -266,136 +265,4 @@ public abstract class CodepointIterator implements Iterator<Codepoint> {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
private static class CharArrayCodepointIterator extends CodepointIterator {
|
||||
protected char[] buffer;
|
||||
|
||||
CharArrayCodepointIterator(char[] buffer) {
|
||||
this(buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
CharArrayCodepointIterator(char[] buffer, int n, int e) {
|
||||
this.buffer = buffer;
|
||||
this.position = n;
|
||||
this.limit = Math.min(buffer.length - n, e);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char get() {
|
||||
return (position < limit) ? buffer[position++] : (char) -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char get(int index) {
|
||||
if (index < 0 || index >= limit) {
|
||||
throw new ArrayIndexOutOfBoundsException(index);
|
||||
}
|
||||
return buffer[index];
|
||||
}
|
||||
}
|
||||
|
||||
private static class CharSequenceCodepointIterator extends CodepointIterator {
|
||||
private CharSequence buffer;
|
||||
|
||||
CharSequenceCodepointIterator(CharSequence buffer) {
|
||||
this(buffer, 0, buffer.length());
|
||||
}
|
||||
|
||||
CharSequenceCodepointIterator(CharSequence buffer, int n, int e) {
|
||||
this.buffer = buffer;
|
||||
this.position = n;
|
||||
this.limit = Math.min(buffer.length() - n, e);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char get() {
|
||||
return buffer.charAt(position++);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected char get(int index) {
|
||||
return buffer.charAt(index);
|
||||
}
|
||||
}
|
||||
|
||||
private static class RestrictedCodepointIterator extends DelegatingCodepointIterator {
|
||||
|
||||
private final Filter filter;
|
||||
private final boolean scanningOnly;
|
||||
private final boolean notset;
|
||||
|
||||
RestrictedCodepointIterator(CodepointIterator internal, Filter filter, boolean scanningOnly) {
|
||||
this(internal, filter, scanningOnly, false);
|
||||
}
|
||||
|
||||
RestrictedCodepointIterator(CodepointIterator internal,
|
||||
Filter filter,
|
||||
boolean scanningOnly,
|
||||
boolean notset) {
|
||||
super(internal);
|
||||
this.filter = filter;
|
||||
this.scanningOnly = scanningOnly;
|
||||
this.notset = notset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
boolean b = super.hasNext();
|
||||
if (scanningOnly) {
|
||||
try {
|
||||
int cp = super.peek(super.position()).getValue();
|
||||
if (b && cp != -1 && check(cp)) {
|
||||
return false;
|
||||
}
|
||||
} catch (InvalidCharacterException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Codepoint next() {
|
||||
Codepoint cp = super.next();
|
||||
int v = cp.getValue();
|
||||
if (v != -1 && check(v)) {
|
||||
if (scanningOnly) {
|
||||
super.position(super.position() - 1);
|
||||
return null;
|
||||
} else {
|
||||
throw new InvalidCharacterException(v);
|
||||
}
|
||||
}
|
||||
return cp;
|
||||
}
|
||||
|
||||
private boolean check(int cp) {
|
||||
return notset == !filter.accept(cp);
|
||||
}
|
||||
|
||||
@Override
|
||||
public char[] nextChars() {
|
||||
char[] chars = super.nextChars();
|
||||
if (chars != null && chars.length > 0) {
|
||||
if (chars.length == 1 && check(chars[0])) {
|
||||
if (scanningOnly) {
|
||||
super.position(super.position() - 1);
|
||||
return null;
|
||||
} else {
|
||||
throw new InvalidCharacterException(chars[0]);
|
||||
}
|
||||
} else if (chars.length == 2) {
|
||||
int cp = CharUtils.toSupplementary(chars[0], chars[1]).getValue();
|
||||
if (check(cp)) {
|
||||
if (scanningOnly) {
|
||||
super.position(super.position() - 2);
|
||||
return null;
|
||||
} else {
|
||||
throw new InvalidCharacterException(cp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return chars;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,8 @@
|
|||
package org.xbib.content.resource.text;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class InvalidCharacterException extends RuntimeException {
|
||||
|
||||
private static final long serialVersionUID = -3037013255350562940L;
|
||||
private final int input;
|
||||
|
||||
public InvalidCharacterException(int input) {
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
package org.xbib.content.resource.text;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public enum Profile {
|
||||
NONE(codepoint -> true),
|
||||
ALPHA(codepoint -> !CharUtils.isAlpha(codepoint)),
|
||||
ALPHANUM(codepoint -> !CharUtils.isAlphaDigit(codepoint)),
|
||||
FRAGMENT(codepoint -> !CharUtils.isFragment(codepoint)),
|
||||
IFRAGMENT(codepoint -> !CharUtils.isIfragment(codepoint)),
|
||||
PATH(codepoint -> !CharUtils.isPath(codepoint)),
|
||||
IPATH(codepoint -> !CharUtils.isIpath(codepoint)),
|
||||
IUSERINFO(codepoint -> !CharUtils.isIuserinfo(codepoint)),
|
||||
USERINFO(codepoint -> !CharUtils.isUserInfo(codepoint)),
|
||||
QUERY(codepoint -> !CharUtils.isQuery(codepoint)),
|
||||
IQUERY(codepoint -> !CharUtils.isIquery(codepoint)),
|
||||
SCHEME(codepoint -> !CharUtils.isScheme(codepoint)),
|
||||
PATHNODELIMS(codepoint -> !CharUtils.isPathNoDelims(codepoint)),
|
||||
IPATHNODELIMS(codepoint -> !CharUtils.isIpathnodelims(codepoint)),
|
||||
IPATHNODELIMS_SEG(codepoint -> !CharUtils.isIpathnodelims(codepoint) && codepoint != '@' && codepoint != ':'),
|
||||
IREGNAME(codepoint -> !CharUtils.isIregname(codepoint)),
|
||||
IHOST(codepoint -> !CharUtils.isIhost(codepoint)),
|
||||
IPRIVATE(codepoint -> !CharUtils.isIprivate(codepoint)),
|
||||
RESERVED(codepoint -> !CharUtils.isReserved(codepoint)),
|
||||
IUNRESERVED(codepoint -> !CharUtils.isIunreserved(codepoint)),
|
||||
UNRESERVED(codepoint -> !CharUtils.isUnreserved(codepoint)),
|
||||
SCHEMESPECIFICPART(codepoint -> !CharUtils.isIunreserved(codepoint) && !CharUtils.isReserved(codepoint)
|
||||
&& !CharUtils.isIprivate(codepoint)
|
||||
&& !CharUtils.isPctEnc(codepoint)
|
||||
&& codepoint != '#'),
|
||||
AUTHORITY(codepoint -> !CharUtils.isRegname(codepoint) && !CharUtils.isUserInfo(codepoint) && !CharUtils.isGenDelim(codepoint)),
|
||||
ASCIISANSCRLF(codepoint -> !CharUtils.inRange(codepoint, 1, 9) && !CharUtils.inRange(codepoint, 14, 127)),
|
||||
PCT(codepoint -> !CharUtils.isPctEnc(codepoint)),
|
||||
STD3ASCIIRULES(codepoint -> !CharUtils.inRange(codepoint, 0x0000, 0x002C) &&
|
||||
!CharUtils.inRange(codepoint, 0x002E, 0x002F) &&
|
||||
!CharUtils.inRange(codepoint, 0x003A, 0x0040) &&
|
||||
!CharUtils.inRange(codepoint, 0x005B, 0x0060) &&
|
||||
!CharUtils.inRange(codepoint, 0x007B, 0x007F));
|
||||
|
||||
private final CodepointFilter filter;
|
||||
|
||||
Profile(CodepointFilter filter) {
|
||||
this.filter = filter;
|
||||
}
|
||||
|
||||
public CodepointFilter filter() {
|
||||
return filter;
|
||||
}
|
||||
|
||||
public boolean check(int codepoint) {
|
||||
return filter.accept(codepoint);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
package org.xbib.content.resource.text;
|
||||
|
||||
class RestrictedCodepointIterator extends DelegatingCodepointIterator {
|
||||
|
||||
private final CodepointFilter filter;
|
||||
private final boolean scanningOnly;
|
||||
private final boolean notset;
|
||||
|
||||
RestrictedCodepointIterator(CodepointIterator internal, CodepointFilter filter, boolean scanningOnly) {
|
||||
this(internal, filter, scanningOnly, false);
|
||||
}
|
||||
|
||||
RestrictedCodepointIterator(CodepointIterator internal,
|
||||
CodepointFilter filter,
|
||||
boolean scanningOnly,
|
||||
boolean notset) {
|
||||
super(internal);
|
||||
this.filter = filter;
|
||||
this.scanningOnly = scanningOnly;
|
||||
this.notset = notset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
boolean b = super.hasNext();
|
||||
if (scanningOnly) {
|
||||
try {
|
||||
int cp = super.peek(super.position()).getValue();
|
||||
if (b && cp != -1 && check(cp)) {
|
||||
return false;
|
||||
}
|
||||
} catch (InvalidCharacterException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Codepoint next() {
|
||||
Codepoint cp = super.next();
|
||||
int v = cp.getValue();
|
||||
if (v != -1 && check(v)) {
|
||||
if (scanningOnly) {
|
||||
super.position(super.position() - 1);
|
||||
return null;
|
||||
} else {
|
||||
throw new InvalidCharacterException(v);
|
||||
}
|
||||
}
|
||||
return cp;
|
||||
}
|
||||
|
||||
private boolean check(int cp) {
|
||||
return notset == !filter.accept(cp);
|
||||
}
|
||||
|
||||
@Override
|
||||
public char[] nextChars() {
|
||||
char[] chars = super.nextChars();
|
||||
if (chars != null && chars.length > 0) {
|
||||
if (chars.length == 1 && check(chars[0])) {
|
||||
if (scanningOnly) {
|
||||
super.position(super.position() - 1);
|
||||
return null;
|
||||
} else {
|
||||
throw new InvalidCharacterException(chars[0]);
|
||||
}
|
||||
} else if (chars.length == 2) {
|
||||
int cp = CharUtils.toSupplementary(chars[0], chars[1]).getValue();
|
||||
if (check(cp)) {
|
||||
if (scanningOnly) {
|
||||
super.position(super.position() - 2);
|
||||
return null;
|
||||
} else {
|
||||
throw new InvalidCharacterException(cp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return chars;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
package org.xbib.content.resource;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class TestIRI {
|
||||
|
||||
@Test
|
||||
public void testSimple() throws URISyntaxException {
|
||||
IRI iri = IRI.create("http://validator.w3.org/check?uri=http%3A%2F%2Fr\u00E9sum\u00E9.example.org");
|
||||
assertEquals("http", iri.getScheme());
|
||||
assertEquals("validator.w3.org", iri.getHost());
|
||||
assertEquals("/check", iri.getPath());
|
||||
assertEquals("//validator.w3.org/check?uri=http%3A%2F%2Frésumé.example.org", iri.getSchemeSpecificPart());
|
||||
//assertEquals("http://validator.w3.org/check?uri=http%3A%2F%2Fr%C3%A9sum%C3%A9.example.org", iri.toURI().toString());
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue