update to gradle 7.3.2, add bgzf (MIT License)

Jörg Prante 2022-05-25 10:41:50 +02:00
parent 9ca8990bf0
commit 337e1c19c0
24 changed files with 2588 additions and 113 deletions

View file

@@ -1,5 +1,5 @@
group = org.xbib
name = archive
version = 1.0.1
version = 1.1.0
gradle.wrapper.version = 6.6.1
gradle.wrapper.version = 7.3.2

Binary file not shown.

View file

@@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-6.6.1-all.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.2-all.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

gradlew (vendored, 269 lines changed)
View file

@@ -1,7 +1,7 @@
#!/usr/bin/env sh
#!/bin/sh
#
# Copyright 2015 the original author or authors.
# Copyright © 2015-2021 the original authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,67 +17,101 @@
#
##############################################################################
##
## Gradle start up script for UN*X
##
#
# Gradle start up script for POSIX generated by Gradle.
#
# Important for running:
#
# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
# noncompliant, but you have some other compliant shell such as ksh or
# bash, then to run this script, type that shell name before the whole
# command line, like:
#
# ksh Gradle
#
# Busybox and similar reduced shells will NOT work, because this script
# requires all of these POSIX shell features:
# * functions;
# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
# * compound commands having a testable exit status, especially «case»;
# * various built-in commands including «command», «set», and «ulimit».
#
# Important for patching:
#
# (2) This script targets any POSIX shell, so it avoids extensions provided
# by Bash, Ksh, etc; in particular arrays are avoided.
#
# The "traditional" practice of packing multiple parameters into a
# space-separated string is a well documented source of bugs and security
# problems, so this is (mostly) avoided, by progressively accumulating
# options in "$@", and eventually passing that to Java.
#
# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
# see the in-line comments for details.
#
# There are tweaks for specific operating systems such as AIX, CygWin,
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
#
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
app_path=$0
# Need this for daisy-chained symlinks.
while
APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
[ -h "$app_path" ]
do
ls=$( ls -ld "$app_path" )
link=${ls#*' -> '}
case $link in #(
/*) app_path=$link ;; #(
*) app_path=$APP_HOME$link ;;
esac
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
APP_BASE_NAME=${0##*/}
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
MAX_FD=maximum
warn () {
echo "$*"
}
} >&2
die () {
echo
echo "$*"
echo
exit 1
}
} >&2
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
case "$( uname )" in #(
CYGWIN* ) cygwin=true ;; #(
Darwin* ) darwin=true ;; #(
MSYS* | MINGW* ) msys=true ;; #(
NONSTOP* ) nonstop=true ;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
@@ -87,9 +121,9 @@ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
JAVACMD=$JAVA_HOME/jre/sh/java
else
JAVACMD="$JAVA_HOME/bin/java"
JAVACMD=$JAVA_HOME/bin/java
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
@@ -98,7 +132,7 @@ Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
JAVACMD=java
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
@@ -106,80 +140,95 @@ location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
case $MAX_FD in #(
max*)
MAX_FD=$( ulimit -H -n ) ||
warn "Could not query maximum file descriptor limit"
esac
case $MAX_FD in #(
'' | soft) :;; #(
*)
ulimit -n "$MAX_FD" ||
warn "Could not set maximum file descriptor limit to $MAX_FD"
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, stacking in reverse order:
# * args from the command line
# * the main class name
# * -classpath
# * -D...appname settings
# * --module-path (only if needed)
# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
# For Cygwin or MSYS, switch paths to Windows format before running java
if "$cygwin" || "$msys" ; then
APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
JAVACMD=$( cygpath --unix "$JAVACMD" )
# Now convert the arguments - kludge to limit ourselves to /bin/sh
for arg do
if
case $arg in #(
-*) false ;; # don't mess with options #(
/?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
[ -e "$t" ] ;; #(
*) false ;;
esac
then
arg=$( cygpath --path --ignore --mixed "$arg" )
fi
# Roll the args list around exactly as many times as the number of
# args, so each arg winds up back in the position where it started, but
# possibly modified.
#
# NB: a `for` loop captures its iteration list before it begins, so
# changing the positional parameters here affects neither the number of
# iterations, nor the values presented in `arg`.
shift # remove old arg
set -- "$@" "$arg" # push replacement arg
done
fi
# Collect all arguments for the java command;
# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
# shell script including quotes and variable substitutions, so put them in
# double quotes to make sure that they get re-expanded; and
# * put everything else in single quotes, so that it's not re-expanded.
set -- \
"-Dorg.gradle.appname=$APP_BASE_NAME" \
-classpath "$CLASSPATH" \
org.gradle.wrapper.GradleWrapperMain \
"$@"
# Use "xargs" to parse quoted args.
#
# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
#
# In Bash we could simply go:
#
# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
# set -- "${ARGS[@]}" "$@"
#
# but POSIX shell has neither arrays nor command substitution, so instead we
# post-process each arg (as a line of input to sed) to backslash-escape any
# character that might be a shell metacharacter, then use eval to reverse
# that process (while maintaining the separation between arguments), and wrap
# the whole thing up as a single "set" statement.
#
# This will of course break if any of these variables contains a newline or
# an unmatched quote.
#
eval "set -- $(
printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
xargs -n1 |
sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
tr '\n' ' '
)" '"$@"'
exec "$JAVACMD" "$@"

View file

@@ -1,5 +1,6 @@
dependencies {
api project(':io-archive')
implementation project(':io-compress-bgzf')
implementation project(':io-compress-bzip2')
implementation project(':io-compress-lzf')
implementation project(':io-compress-xz')

View file

@@ -2,11 +2,13 @@ module org.xbib.io.codec {
uses org.xbib.io.codec.StreamCodec;
exports org.xbib.io.codec;
exports org.xbib.io.codec.ar;
exports org.xbib.io.codec.bgzf;
exports org.xbib.io.codec.cpio;
exports org.xbib.io.codec.file;
exports org.xbib.io.codec.jar;
exports org.xbib.io.codec.tar;
exports org.xbib.io.codec.zip;
requires transitive org.xbib.io.compress.bgzf;
requires org.xbib.io.compress.bzip;
requires org.xbib.io.compress.lzf;
requires org.xbib.io.compress.xz;

View file

@@ -0,0 +1,37 @@
package org.xbib.io.codec.bgzf;
import org.xbib.io.codec.StreamCodec;
import org.xbib.io.compress.bgzf.BlockCompressedInputStream;
import org.xbib.io.compress.bgzf.BlockCompressedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
public class BzgfStreamCodec implements StreamCodec<BlockCompressedInputStream, BlockCompressedOutputStream> {
@Override
public String getName() {
return "bgzf";
}
@Override
public BlockCompressedInputStream decode(InputStream in) throws IOException {
return new BlockCompressedInputStream(in);
}
@Override
public BlockCompressedInputStream decode(InputStream in, int bufsize) throws IOException {
return new BlockCompressedInputStream(in);
}
@Override
public BlockCompressedOutputStream encode(OutputStream out) throws IOException {
return new BlockCompressedOutputStream(out);
}
@Override
public BlockCompressedOutputStream encode(OutputStream out, int bufsize) throws IOException {
return new BlockCompressedOutputStream(out);
}
}
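
A minimal round-trip sketch of how this codec might be used. It assumes the single-argument BlockCompressedOutputStream(OutputStream) constructor referenced in the codec above and that the writer emits a complete BGZF stream on close; the class name BzgfCodecExample and the sample payload are illustrative only.

import org.xbib.io.codec.bgzf.BzgfStreamCodec;
import org.xbib.io.compress.bgzf.BlockCompressedInputStream;
import org.xbib.io.compress.bgzf.BlockCompressedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

public class BzgfCodecExample {
    public static void main(String[] args) throws Exception {
        BzgfStreamCodec codec = new BzgfStreamCodec();
        byte[] payload = "hello bgzf".getBytes(StandardCharsets.UTF_8);
        // Compress: encode(OutputStream) wraps the sink in a BlockCompressedOutputStream.
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        try (BlockCompressedOutputStream out = codec.encode(sink)) {
            out.write(payload);
        }
        // Decompress: decode(InputStream) wraps the source in a BlockCompressedInputStream.
        try (BlockCompressedInputStream in =
                     codec.decode(new ByteArrayInputStream(sink.toByteArray()))) {
            System.out.println(new String(in.readAllBytes(), StandardCharsets.UTF_8));
        }
    }
}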

View file

@@ -0,0 +1,21 @@
/*
* The MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

View file

@@ -0,0 +1,3 @@
module org.xbib.io.compress.bgzf {
exports org.xbib.io.compress.bgzf;
}

View file

@@ -0,0 +1,19 @@
package org.xbib.io.compress.bgzf;
@SuppressWarnings("serial")
public class BGZFException extends RuntimeException {
public BGZFException() {}
public BGZFException(final String s) {
super(s);
}
public BGZFException(final String s, final Throwable throwable) {
super(s, throwable);
}
public BGZFException(final Throwable throwable) {
super(throwable);
}
}

View file

@@ -0,0 +1,67 @@
package org.xbib.io.compress.bgzf;
public class BGZFFilePointerUtil {
private static final int SHIFT_AMOUNT = 16;
private static final int OFFSET_MASK = 0xffff;
private static final long ADDRESS_MASK = 0xFFFFFFFFFFFFL;
public static final long MAX_BLOCK_ADDRESS = ADDRESS_MASK;
public static final int MAX_OFFSET = OFFSET_MASK;
public static int compare(final long vfp1, final long vfp2) {
if (vfp1 == vfp2) return 0;
// When treating as unsigned, negative number is > positive.
if (vfp1 < 0 && vfp2 >= 0) return 1;
if (vfp1 >= 0 && vfp2 < 0) return -1;
// Either both negative or both non-negative, so regular comparison works.
if (vfp1 < vfp2) return -1;
return 1; // vfp1 > vfp2
}
/**
* @return true if vfp2 points to somewhere in the same BGZF block, or the one immediately
* following vfp1's BGZF block.
*/
public static boolean areInSameOrAdjacentBlocks(final long vfp1, final long vfp2) {
final long block1 = getBlockAddress(vfp1);
final long block2 = getBlockAddress(vfp2);
return (block1 == block2 || block1 + 1 == block2);
}
/**
* @param blockAddress File offset of start of BGZF block.
* @param blockOffset Offset into uncompressed block.
* @return Virtual file pointer that embodies the input parameters.
*/
static long makeFilePointer(final long blockAddress, final int blockOffset) {
if (blockOffset < 0) {
throw new IllegalArgumentException("Negative blockOffset " + blockOffset
+ " not allowed.");
}
if (blockAddress < 0) {
throw new IllegalArgumentException("Negative blockAddress " + blockAddress
+ " not allowed.");
}
if (blockOffset > MAX_OFFSET) {
throw new IllegalArgumentException("blockOffset " + blockOffset + " too large.");
}
if (blockAddress > MAX_BLOCK_ADDRESS) {
throw new IllegalArgumentException("blockAddress " + blockAddress + " too large.");
}
return blockAddress << SHIFT_AMOUNT | blockOffset;
}
public static long getBlockAddress(final long virtualFilePointer) {
return (virtualFilePointer >> SHIFT_AMOUNT) & ADDRESS_MASK;
}
public static int getBlockOffset(final long virtualFilePointer) {
return (int)(virtualFilePointer & OFFSET_MASK);
}
public static String asString(final long vfp) {
return String.format("%d(0x%x): (block address: %d, offset: %d)", vfp, vfp, getBlockAddress(vfp), getBlockOffset(vfp));
}
}
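
The virtual file pointer packs a 48-bit compressed block address in the upper bits and a 16-bit offset into the uncompressed block in the lower 16 bits. A minimal sketch of the round trip through the public accessors; makeFilePointer is package-private above, so the same packing is repeated inline, and the sample values are illustrative.

import org.xbib.io.compress.bgzf.BGZFFilePointerUtil;

public class VirtualFilePointerExample {
    public static void main(String[] args) {
        long blockAddress = 123_456L; // byte offset of the BGZF block in the compressed file
        int blockOffset = 789;        // byte offset into the uncompressed block
        // Same packing as the package-private makeFilePointer above: 48-bit address, 16-bit offset.
        long vfp = (blockAddress << 16) | blockOffset;
        System.out.println(BGZFFilePointerUtil.getBlockAddress(vfp)); // 123456
        System.out.println(BGZFFilePointerUtil.getBlockOffset(vfp));  // 789
        System.out.println(BGZFFilePointerUtil.asString(vfp));
    }
}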

View file

@@ -0,0 +1,19 @@
package org.xbib.io.compress.bgzf;
@SuppressWarnings("serial")
public class BGZFFormatException extends BGZFException {
public BGZFFormatException() {}
public BGZFFormatException(final String s) {
super(s);
}
public BGZFFormatException(final String s, final Throwable throwable) {
super(s, throwable);
}
public BGZFFormatException(final Throwable throwable) {
super(throwable);
}
}

View file

@@ -0,0 +1,95 @@
package org.xbib.io.compress.bgzf;
/**
* Constants shared by BlockCompressed{Input,Output}Stream classes
*/
public class BGZFStreamConstants {
// Number of bytes in the gzip block before the deflated data.
// This is not the standard header size, because we include one optional subfield,
// but it is the standard for us.
public static final int BLOCK_HEADER_LENGTH = 18;
// Location in the gzip block of the total block size (actually total block size - 1)
public static final int BLOCK_LENGTH_OFFSET = 16;
// Number of bytes that follow the deflated data
public static final int BLOCK_FOOTER_LENGTH = 8;
// We require that a compressed block (including header and footer) be <= this
public static final int MAX_COMPRESSED_BLOCK_SIZE = 64 * 1024;
// Gzip overhead is the header, the footer, and the block size (encoded as a short).
public static final int GZIP_OVERHEAD = BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH + 2;
// If Deflater has compression level == NO_COMPRESSION, 10 bytes of overhead (determined experimentally).
public static final int NO_COMPRESSION_OVERHEAD = 10;
// Push out a gzip block when this many uncompressed bytes have been accumulated.
// This size is selected so that if data is not compressible, if Deflater is given
// compression level == NO_COMPRESSION, compressed size is guaranteed to be <= MAX_COMPRESSED_BLOCK_SIZE.
public static final int DEFAULT_UNCOMPRESSED_BLOCK_SIZE = 64 * 1024 - (GZIP_OVERHEAD + NO_COMPRESSION_OVERHEAD);
// Magic numbers
public static final byte GZIP_ID1 = 31;
public static final int GZIP_ID2 = 139;
// FEXTRA flag means there are optional fields
public static final int GZIP_FLG = 4;
// extra flags
public static final int GZIP_XFL = 0;
// length of extra subfield
public static final short GZIP_XLEN = 6;
// The deflate compression, which is customarily used by gzip
public static final byte GZIP_CM_DEFLATE = 8;
public static final int DEFAULT_COMPRESSION_LEVEL = 5;
// We don't care about OS because we're not doing line terminator translation
public static final int GZIP_OS_UNKNOWN = 255;
// The subfield ID
public static final byte BGZF_ID1 = 66;
public static final byte BGZF_ID2 = 67;
// subfield length in bytes
public static final byte BGZF_LEN = 2;
public static final byte[] EMPTY_GZIP_BLOCK = {
BGZFStreamConstants.GZIP_ID1,
(byte)BGZFStreamConstants.GZIP_ID2,
BGZFStreamConstants.GZIP_CM_DEFLATE,
BGZFStreamConstants.GZIP_FLG,
0, 0, 0, 0, // Modification time
BGZFStreamConstants.GZIP_XFL,
(byte)BGZFStreamConstants.GZIP_OS_UNKNOWN,
BGZFStreamConstants.GZIP_XLEN, 0, // Little-endian short
BGZFStreamConstants.BGZF_ID1,
BGZFStreamConstants.BGZF_ID2,
BGZFStreamConstants.BGZF_LEN, 0, // Little-endian short
// Total block size - 1
BGZFStreamConstants.BLOCK_HEADER_LENGTH +
BGZFStreamConstants.BLOCK_FOOTER_LENGTH - 1 + 2, 0, // Little-endian short
// Dummy payload?
3, 0,
0, 0, 0, 0, // crc
0, 0, 0, 0, // uncompressedSize
};
public static final byte[] GZIP_BLOCK_PREAMBLE = {
BGZFStreamConstants.GZIP_ID1,
(byte)BGZFStreamConstants.GZIP_ID2,
BGZFStreamConstants.GZIP_CM_DEFLATE,
BGZFStreamConstants.GZIP_FLG,
0, 0, 0, 0, // Modification time
BGZFStreamConstants.GZIP_XFL,
(byte)BGZFStreamConstants.GZIP_OS_UNKNOWN,
BGZFStreamConstants.GZIP_XLEN, 0, // Little-endian short
BGZFStreamConstants.BGZF_ID1,
BGZFStreamConstants.BGZF_ID2,
BGZFStreamConstants.BGZF_LEN, 0, // Little-endian short
};
}
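
A short sketch of how these constants describe the BGZF block header: gzip magic bytes, the FEXTRA flag, and a 6-byte extra field carrying the "BC" BGZF subfield. The check mirrors the isValidBlockHeader logic in BlockCompressedInputStream further down; the helper class name is illustrative, and the 28-byte EMPTY_GZIP_BLOCK terminator defined above doubles as a convenient sample header.

import org.xbib.io.compress.bgzf.BGZFStreamConstants;

public class BgzfHeaderCheckExample {
    // Mirrors the isValidBlockHeader check used by BlockCompressedInputStream below.
    static boolean looksLikeBgzfHeader(byte[] h) {
        return h.length >= BGZFStreamConstants.BLOCK_HEADER_LENGTH
                && h[0] == BGZFStreamConstants.GZIP_ID1
                && (h[1] & 0xFF) == BGZFStreamConstants.GZIP_ID2
                && (h[3] & BGZFStreamConstants.GZIP_FLG) != 0
                && h[10] == BGZFStreamConstants.GZIP_XLEN
                && h[12] == BGZFStreamConstants.BGZF_ID1
                && h[13] == BGZFStreamConstants.BGZF_ID2;
    }

    public static void main(String[] args) {
        // The empty terminator block defined above starts with a valid BGZF block header.
        System.out.println(looksLikeBgzfHeader(BGZFStreamConstants.EMPTY_GZIP_BLOCK)); // true
    }
}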

View file

@@ -0,0 +1,666 @@
package org.xbib.io.compress.bgzf;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.SyncFailedException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
* Encapsulates file representation of various primitive data types. Forces little-endian disk
* representation. Note that this class is currently not very efficient. There are plans to increase
* the size of the ByteBuffer, and move data between the ByteBuffer and the underlying input or
* output stream in larger chunks.
*
* All the read methods throw EOFException if the input stream is exhausted before the
* required number of bytes are read.
*/
public class BinaryCodec {
// Outstream to write to
private OutputStream outputStream;
// If a file or filename was given it will be stored here. Used for error reporting.
private String outputFileName;
// Input stream to read from
private InputStream inputStream;
// If a file or filename was given to read from it will be stored here. Used for error reporting.
private String inputFileName;
/*
* Mode that the BinaryCodec is in. It is either writing to a binary file or reading from one.
* This is set to true if it is writing to a binary file. Right now we don't support reading and
* writing to the same file with the same BinaryCodec instance.
*/
private boolean isWriting;
/**
* For byte swapping.
*/
private ByteBuffer byteBuffer;
/**
* For reading Strings of known length, this can reduce object creation
*/
private final byte[] scratchBuffer = new byte[16];
// Byte order used in BAM files.
private static final ByteOrder LITTLE_ENDIAN = ByteOrder.LITTLE_ENDIAN;
private static final byte[] NULL_BYTE = {0};
private static final long MAX_UBYTE = (Byte.MAX_VALUE * 2) + 1;
private static final long MAX_USHORT = (Short.MAX_VALUE * 2) + 1;
private static final long MAX_UINT = ((long)Integer.MAX_VALUE * 2) + 1;
// We never serialize more than this much at a time (except for Strings)
private static final int MAX_BYTE_BUFFER = 8;
/**
* Constructs BinaryCodec from a file and sets its mode to writing or not
*
* @param file file to be written to or read from
* @param writing whether the file is being written to
* @throws FileNotFoundException
*/
public BinaryCodec(final File file, final boolean writing) throws FileNotFoundException {
this();
this.isWriting = writing;
if (this.isWriting) {
this.outputStream = new FileOutputStream(file);
this.outputFileName = file.getName();
} else {
this.inputStream = new FileInputStream(file);
this.inputFileName = file.getName();
}
}
/**
* Constructs BinaryCodec from a file name and sets its mode to writing or not
*
* @param fileName name of the file to be written to or read from
* @param writing whether the file is being written to
* @throws FileNotFoundException
*/
public BinaryCodec(final String fileName, final boolean writing) throws FileNotFoundException {
this(new File(fileName), writing);
}
/**
* Constructs BinaryCodec from an output stream
*
* @param outputStream Stream to write to, since it's an output stream we know that isWriting
* should be set to true
*/
public BinaryCodec(final OutputStream outputStream) {
this();
setOutputStream(outputStream);
}
/**
* Constructs BinaryCodec from an input stream
*
* @param inputStream Stream to read from, since we are reading isWriting is set to false
*/
public BinaryCodec(final InputStream inputStream) {
this();
setInputStream(inputStream);
}
/**
* Ambiguous whether reading or writing until set{In,Out}putStream is called
*/
public BinaryCodec() {
initByteBuffer();
}
/**
* Shared among ctors. Note that if endianness is changed, all the unsigned methods must also be
* changed.
*/
private void initByteBuffer() {
byteBuffer = ByteBuffer.allocate(MAX_BYTE_BUFFER);
byteBuffer.order(LITTLE_ENDIAN);
}
/**
* Write whatever has been put into the byte buffer
*
* @param numBytes -- how much to write. Note that in case of writing an unsigned value, more
* bytes were put into the ByteBuffer than will get written out.
* @throws IOException
*/
private void writeByteBuffer(final int numBytes) throws IOException {
assert (numBytes <= byteBuffer.limit());
writeBytes(byteBuffer.array(), 0, numBytes);
}
/**
* Writes a byte to the output buffer
*
* @param bite byte to write
* @throws IOException
*/
public void writeByte(final byte bite) throws IOException {
byteBuffer.clear();
byteBuffer.put(bite);
writeByteBuffer(1);
}
public void writeByte(final int b) throws IOException {
writeByte((byte)b);
}
/**
* Writes a byte array to the output buffer
*
* @param bytes value to write
* @throws IOException
*/
public void writeBytes(final byte[] bytes) throws IOException {
writeBytes(bytes, 0, bytes.length);
}
public void writeBytes(final byte[] bytes, final int startOffset, final int numBytes) throws IOException {
if (!isWriting) {
throw new IllegalStateException("Calling write method on BinaryCodec open for read.");
}
outputStream.write(bytes, startOffset, numBytes);
}
/**
* Write a 32-bit int to the output stream
*
* @param value int to write
* @throws IOException
*/
public void writeInt(final int value) throws IOException {
byteBuffer.clear();
byteBuffer.putInt(value);
writeByteBuffer(4);
}
/**
* Write a double (8 bytes) to the output stream
*
* @param value double to write
* @throws IOException
*/
public void writeDouble(final double value) throws IOException {
byteBuffer.clear();
byteBuffer.putDouble(value);
writeByteBuffer(8);
}
/**
* Write a 64-bit long to the output stream
*
* @param value long to write
* @throws IOException
*/
public void writeLong(final long value) throws IOException {
byteBuffer.clear();
byteBuffer.putLong(value);
writeByteBuffer(8);
}
/**
* Write a 16-bit short to output stream
*
* @throws IOException
*/
public void writeShort(final short value) throws IOException {
byteBuffer.clear();
byteBuffer.putShort(value);
writeByteBuffer(2);
}
/**
* Write a float (4 bytes) to the output stream
*
* @param value float to write
* @throws IOException
*/
public void writeFloat(final float value) throws IOException {
byteBuffer.clear();
byteBuffer.putFloat(value);
writeByteBuffer(4);
}
/**
* Writes a boolean (1 byte) to the output buffer
*
* @param value boolean to write
* @throws IOException
*/
public void writeBoolean(final boolean value) throws IOException {
byteBuffer.clear();
byteBuffer.put(value ? (byte)1 : (byte)0);
writeByteBuffer(1);
}
/**
* Writes a string to the buffer as ASCII bytes
*
* @param value string to write to buffer
* @param writeLength prefix the string with the length as a 32-bit int
* @param appendNull add a null byte to the end of the string
* @throws IOException
*/
public void writeString(final String value, final boolean writeLength, final boolean appendNull) throws IOException {
if (writeLength) {
int lengthToWrite = value.length();
if (appendNull) lengthToWrite++;
writeInt(lengthToWrite);
}
// Actually writes the string to a buffer
writeString(value);
if (appendNull) writeBytes(NULL_BYTE);
}
/**
* Write a string to the buffer as ASCII bytes
*
* @param value string to write
* @throws IOException
*/
private void writeString(final String value) throws IOException {
final byte[] byteBuffer = new byte[value.length()];
final char[] charBuffer = value.toCharArray();
for (int i = 0; i < charBuffer.length; ++i) {
byteBuffer[i] = (byte)(charBuffer[i] & 0xff);
}
writeBytes(byteBuffer);
}
/**
* Write an 8-bit unsigned byte. NOTE: This method will break if we change to big-endian.
*
* @throws IOException
*/
public void writeUByte(final short val) throws IOException {
if (val < 0) {
throw new IllegalArgumentException("Negative value (" + val
+ ") passed to unsigned writing method.");
}
if (val > MAX_UBYTE) {
throw new IllegalArgumentException("Value (" + val
+ ") to large to be written as ubyte.");
}
byteBuffer.clear();
byteBuffer.putShort(val);
writeByteBuffer(1);
}
/**
* Write a 16-bit unsigned short. NOTE: This method will break if we change to big-endian.
*
* @throws IOException
*/
public void writeUShort(final int val) throws IOException {
if (val < 0) {
throw new IllegalArgumentException("Negative value (" + val
+ ") passed to unsigned writing method.");
}
if (val > MAX_USHORT) {
throw new IllegalArgumentException("Value (" + val
+ ") to large to be written as ushort.");
}
byteBuffer.clear();
byteBuffer.putInt(val);
writeByteBuffer(2);
}
/**
* Write a 32-bit unsigned int. NOTE: This method will break if we change to big-endian.
*
* @throws IOException
*/
public void writeUInt(final long val) throws IOException {
if (val < 0) {
throw new IllegalArgumentException("Negative value (" + val
+ ") passed to unsigned writing method.");
}
if (val > MAX_UINT) {
throw new IllegalArgumentException("Value (" + val
+ ") to large to be written as uint.");
}
byteBuffer.clear();
byteBuffer.putLong(val);
writeByteBuffer(4);
}
/**
* Read a byte array from the input stream.
*
* @throws IOException
*/
public void readBytes(final byte[] buffer) throws IOException {
readBytes(buffer, 0, buffer.length);
}
/**
* Read a byte array from the input stream
*
* @param buffer where to put bytes read
* @param offset offset to start putting bytes into buffer
* @param length number of bytes to read
* @throws IOException
*/
public void readBytes(final byte[] buffer, final int offset, final int length) throws IOException {
int totalNumRead = 0;
do {
final int numRead =
readBytesOrFewer(buffer, offset + totalNumRead, length - totalNumRead);
if (numRead < 0) {
throw new EOFException(constructErrorMessage("Premature EOF"));
} else {
totalNumRead += numRead;
}
} while (totalNumRead < length);
}
/**
* Reads a byte array from the input stream.
*
* @param buffer where to put bytes read
* @param offset offset to start putting bytes into buffer
* @param length number of bytes to read. Fewer bytes may be read if EOF is reached before
* length bytes have been read.
* @return the total number of bytes read into the buffer, or -1 if there is no more data
* because the end of the stream has been reached.
* @throws IOException
*/
public int readBytesOrFewer(final byte[] buffer, final int offset, final int length) throws IOException {
if (isWriting) {
throw new IllegalStateException("Calling read method on BinaryCodec open for write.");
}
return inputStream.read(buffer, offset, length);
}
/**
* @return a single byte read from the input stream.
* @throws IOException
*/
public byte readByte() throws IOException {
if (isWriting) {
throw new IllegalStateException("Calling read method on BinaryCodec open for write.");
}
final int ret = inputStream.read();
if (ret == -1) {
throw new EOFException(constructErrorMessage("Premature EOF"));
}
return (byte)ret;
}
/**
* @return true if it is possible to know for sure if at EOF, and it is known for sure. If the
* input stream is a ByteArrayInputStream, this is faster than causing a
* RuntimeEOFException to be thrown.
* @throws IOException
*/
public boolean knownAtEof() throws IOException {
if (isWriting) {
throw new IllegalStateException(
"Calling knownAtEof method on BinaryCodec open for write.");
}
return inputStream instanceof ByteArrayInputStream && inputStream.available() == 0;
}
/**
* Read a string off the input stream, as ASCII bytes
*
* @param length length of string to read
* @return String read from stream
* @throws IOException
*/
public String readString(final int length) throws IOException {
final byte[] buffer;
// Recycle single buffer if possible
if (length <= scratchBuffer.length) {
buffer = scratchBuffer;
} else {
buffer = new byte[length];
}
readBytes(buffer, 0, length);
final char[] charBuffer = new char[length];
for (int i = 0; i < length; ++i) {
charBuffer[i] = (char)buffer[i];
}
return new String(charBuffer);
}
/**
* Read ASCII bytes from the input stream until a null byte is read
*
* @return String constructed from the ASCII bytes read
* @throws IOException
*/
public String readNullTerminatedString() throws IOException {
final StringBuilder ret = new StringBuilder();
for (byte b = this.readByte(); b != 0; b = this.readByte()) {
ret.append((char)(b & 0xff));
}
return ret.toString();
}
/**
* Read an int length, and then a String of that length
*
* @param devourNull if true, the length includes a null terminator, which is read and discarded
* @throws IOException
*/
public String readLengthAndString(final boolean devourNull) throws IOException {
int length = readInt();
if (devourNull) {
--length;
}
final String ret = readString(length);
if (devourNull) {
readByte();
}
return ret;
}
private void readByteBuffer(final int numBytes) throws IOException {
assert (numBytes <= byteBuffer.capacity());
readBytes(byteBuffer.array(), 0, numBytes);
byteBuffer.limit(byteBuffer.capacity());
byteBuffer.position(numBytes);
}
/**
* Read an int off the input stream
*
* @return int from input stream
* @throws IOException
*/
public int readInt() throws IOException {
readByteBuffer(4);
byteBuffer.flip();
return byteBuffer.getInt();
}
/**
* Reads a double off the input stream
*
* @return double
* @throws IOException
*/
public double readDouble() throws IOException {
readByteBuffer(8);
byteBuffer.flip();
return byteBuffer.getDouble();
}
/**
* Reads a long off the input stream
*
* @return long
* @throws IOException
*/
public long readLong() throws IOException {
readByteBuffer(8);
byteBuffer.flip();
return byteBuffer.getLong();
}
public short readShort() throws IOException {
readByteBuffer(2);
byteBuffer.flip();
return byteBuffer.getShort();
}
/**
* Reads a float off the input stream
*
* @return float
* @throws IOException
*/
public float readFloat() throws IOException {
readByteBuffer(4);
byteBuffer.flip();
return byteBuffer.getFloat();
}
/**
* Reads a boolean off the input stream, represented as a byte with value 1 or 0
*
* @return boolean
* @throws IOException
*/
public boolean readBoolean() throws IOException {
return ((readByte()) == 1);
}
/**
* Reads an 8-bit unsigned byte from the input stream. This method assumes little-endianness.
*
* @throws IOException
*/
public short readUByte() throws IOException {
readByteBuffer(1);
byteBuffer.put((byte)0);
byteBuffer.flip();
return byteBuffer.getShort();
}
/**
* Reads a 16-bit unsigned short from the input stream. This method assumes little-endianness.
*
* @throws IOException
*/
public int readUShort() throws IOException {
readByteBuffer(2);
byteBuffer.putShort((short)0);
byteBuffer.flip();
return byteBuffer.getInt();
}
/**
* Reads a 32-bit unsigned int from the input stream. This method assumes little-endianness.
*
* @throws IOException
*/
public long readUInt() throws IOException {
readByteBuffer(4);
byteBuffer.putInt(0);
byteBuffer.flip();
return byteBuffer.getLong();
}
/**
* Close the appropriate stream
*
* @throws IOException
*/
public void close() throws IOException {
if (this.isWriting) {
// To the degree possible, make sure the bytes get forced to the file system,
// or else cause an exception to be thrown.
if (this.outputStream instanceof FileOutputStream) {
this.outputStream.flush();
FileOutputStream fos = (FileOutputStream)this.outputStream;
try {
fos.getFD().sync();
} catch (SyncFailedException e) {
// ignore
}
}
this.outputStream.close();
} else {
this.inputStream.close();
}
}
private String constructErrorMessage(final String msg) {
final StringBuilder sb = new StringBuilder(msg);
sb.append("; BinaryCodec in ");
sb.append(isWriting ? "write" : "read");
sb.append("mode; ");
final String filename = isWriting ? outputFileName : inputFileName;
if (filename != null) {
sb.append("file: ");
sb.append(filename);
} else {
sb.append("streamed file (filename not available)");
}
return sb.toString();
}
public String getInputFileName() {
return inputFileName;
}
public String getOutputFileName() {
return outputFileName;
}
public void setOutputFileName(final String outputFileName) {
this.outputFileName = outputFileName;
}
public void setInputFileName(final String inputFileName) {
this.inputFileName = inputFileName;
}
public boolean isWriting() {
return isWriting;
}
public OutputStream getOutputStream() {
return outputStream;
}
public InputStream getInputStream() {
return inputStream;
}
public void setInputStream(final InputStream is) {
isWriting = false;
this.inputStream = is;
}
public void setOutputStream(final OutputStream os) {
isWriting = true;
this.outputStream = os;
}
}
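
A minimal usage sketch for BinaryCodec: write a few little-endian values to an in-memory stream, then read them back in the same order. The class name and sample values are illustrative only.

import org.xbib.io.compress.bgzf.BinaryCodec;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

public class BinaryCodecExample {
    public static void main(String[] args) throws Exception {
        // Write little-endian primitives to an in-memory stream ...
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        BinaryCodec writer = new BinaryCodec(bytes);
        writer.writeInt(42);
        writer.writeUShort(65535);
        writer.writeString("BGZF", true, true); // length-prefixed, null-terminated
        writer.close();
        // ... then read them back in the same order.
        BinaryCodec reader = new BinaryCodec(new ByteArrayInputStream(bytes.toByteArray()));
        System.out.println(reader.readInt());                // 42
        System.out.println(reader.readUShort());              // 65535
        System.out.println(reader.readLengthAndString(true)); // BGZF
        reader.close();
    }
}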

View file

@@ -0,0 +1,709 @@
package org.xbib.io.compress.bgzf;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
/**
* Stream class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
* entire file up to the location being sought. Note that seeking is only possible if the input stream is seekable.
*
* Note that this implementation is not synchronized. If multiple threads access an instance concurrently, it must be synchronized externally.
*
* @see <a href="http://samtools.sourceforge.net/SAM1.pdf">http://samtools.sourceforge.net/SAM1.pdf</a> for details of BGZF file format.
*/
public class BlockCompressedInputStream extends InputStream {
public final static String INCORRECT_HEADER_SIZE_MSG = "Incorrect header size for file: ";
public final static String UNEXPECTED_BLOCK_LENGTH_MSG = "Unexpected compressed block length: ";
public final static String PREMATURE_END_MSG = "Premature end of file: ";
public final static String CANNOT_SEEK_STREAM_MSG = "Cannot seek a position for a non-file stream";
public final static String CANNOT_SEEK_CLOSED_STREAM_MSG = "Cannot seek a position for a closed stream";
public final static String INVALID_FILE_PTR_MSG = "Invalid file pointer: ";
private InputStream mStream;
private boolean mIsClosed = false;
private SeekableStream mFile;
private byte[] mFileBuffer = null;
private DecompressedBlock mCurrentBlock = null;
private int mCurrentOffset = 0;
private long mStreamOffset = 0;
private final BlockGunzipper blockGunzipper;
private volatile ByteArrayOutputStream buf = null;
private static final byte eol = '\n';
private static final byte eolCr = '\r';
/**
* Note that seek() is not supported if this ctor is used.
* @param stream source of bytes
*/
public BlockCompressedInputStream(final InputStream stream) {
this(stream, true, BlockGunzipper.getDefaultInflaterFactory());
}
/**
* Note that seek() is not supported if this ctor is used.
* @param stream source of bytes
* @param inflaterFactory {@link InflaterFactory} used by {@link BlockGunzipper}
*/
public BlockCompressedInputStream(final InputStream stream, final InflaterFactory inflaterFactory) {
this(stream, true, inflaterFactory);
}
/**
* Note that seek() is not supported if this ctor is used.
* @param stream source of bytes
* @param allowBuffering if true, allow buffering
*/
public BlockCompressedInputStream(final InputStream stream, final boolean allowBuffering) {
this(stream, allowBuffering, BlockGunzipper.getDefaultInflaterFactory());
}
/**
* Note that seek() is not supported if this ctor is used.
* @param stream source of bytes
* @param allowBuffering if true, allow buffering
* @param inflaterFactory {@link InflaterFactory} used by {@link BlockGunzipper}
*/
public BlockCompressedInputStream(final InputStream stream, final boolean allowBuffering, final InflaterFactory inflaterFactory) {
if (allowBuffering) {
mStream = new BufferedInputStream(stream);
}
else {
mStream = stream;
}
mFile = null;
blockGunzipper = new BlockGunzipper(inflaterFactory);
}
/**
* Use this ctor if you wish to call seek()
* @param file source of bytes
* @throws IOException
*/
public BlockCompressedInputStream(final File file) throws IOException {
this(file, BlockGunzipper.getDefaultInflaterFactory());
}
/**
* Use this ctor if you wish to call seek()
* @param file source of bytes
* @param inflaterFactory {@link InflaterFactory} used by {@link BlockGunzipper}
* @throws IOException
*/
public BlockCompressedInputStream(final File file, final InflaterFactory inflaterFactory) throws IOException {
mFile = new SeekableFileStream(file);
mStream = null;
blockGunzipper = new BlockGunzipper(inflaterFactory);
}
/**
* For providing some arbitrary data source. No additional buffering is
* provided, so if the underlying source is not buffered, wrap it in a
* SeekableBufferedStream before passing to this ctor.
* @param strm source of bytes
*/
public BlockCompressedInputStream(final SeekableStream strm) {
this(strm, BlockGunzipper.getDefaultInflaterFactory());
}
/**
* For providing some arbitrary data source. No additional buffering is
* provided, so if the underlying source is not buffered, wrap it in a
* SeekableBufferedStream before passing to this ctor.
* @param strm source of bytes
* @param inflaterFactory {@link InflaterFactory} used by {@link BlockGunzipper}
*/
public BlockCompressedInputStream(final SeekableStream strm, final InflaterFactory inflaterFactory) {
mFile = strm;
mStream = null;
blockGunzipper = new BlockGunzipper(inflaterFactory);
}
/**
* Determines whether or not the inflater will re-calculate the CRC on the decompressed data
* and check it against the value stored in the GZIP header. CRC checking is an expensive
* operation and should be used accordingly.
*/
public void setCheckCrcs(final boolean check) {
this.blockGunzipper.setCheckCrcs(check);
}
/**
* @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
* next caller of a method for this input stream. The next caller might be the same thread or another thread.
* Note that although the next caller can read this many bytes without blocking, the available() method call itself
* may block in order to fill an internal buffer if it has been exhausted.
*/
@Override
public int available() throws IOException {
if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.mBlock.length) {
readBlock();
}
if (mCurrentBlock == null) {
return 0;
}
return mCurrentBlock.mBlock.length - mCurrentOffset;
}
/**
* @return <code>true</code> if the stream is at the end of a BGZF block,
* <code>false</code> otherwise.
*/
public boolean endOfBlock() {
return (mCurrentBlock != null && mCurrentOffset == mCurrentBlock.mBlock.length);
}
/**
* Closes the underlying InputStream or RandomAccessFile
*/
@Override
public void close() throws IOException {
if (mFile != null) {
mFile.close();
mFile = null;
} else if (mStream != null) {
mStream.close();
mStream = null;
}
// Encourage garbage collection
mFileBuffer = null;
mCurrentBlock = null;
// Mark as closed
mIsClosed = true;
}
/**
* Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
* If no byte is available because the end of the stream has been reached, the value -1 is returned.
* This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
* @return the next byte of data, or -1 if the end of the stream is reached.
*/
@Override
public int read() throws IOException {
return (available() > 0) ? (mCurrentBlock.mBlock[mCurrentOffset++] & 0xFF) : -1;
}
/**
* Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
* actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
* or an exception is thrown.
*
* read(buf) has the same effect as read(buf, 0, buf.length).
*
* @param buffer the buffer into which the data is read.
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
* the stream has been reached.
*/
@Override
public int read(final byte[] buffer) throws IOException {
return read(buffer, 0, buffer.length);
}
/**
* Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
* carriage return ('\r') or carriage return followed by a line feed ("\r\n").
*
* @return A String containing the contents of the line, excluding the line terminating
* character, or null if the end of the stream has been reached
*
* @exception IOException If an I/O error occurs
*/
public String readLine() throws IOException {
int available = available();
if (available == 0) {
return null;
}
if(null == buf){ // lazy initialisation
buf = new ByteArrayOutputStream(8192);
}
buf.reset();
boolean done = false;
boolean foundCr = false; // \r found flag
while (!done) {
int linetmpPos = mCurrentOffset;
int bCnt = 0;
while((available-- > 0)){
final byte c = mCurrentBlock.mBlock[linetmpPos++];
if(c == eol){ // found \n
done = true;
break;
} else if(foundCr){ // previous char was \r
--linetmpPos; // current char is not \n so put it back
done = true;
break;
} else if(c == eolCr){ // found \r
foundCr = true;
continue; // no ++bCnt
}
++bCnt;
}
if(mCurrentOffset < linetmpPos) {
buf.write(mCurrentBlock.mBlock, mCurrentOffset, bCnt);
mCurrentOffset = linetmpPos;
}
available = available();
if(available == 0) {
// EOF
done = true;
}
}
return buf.toString();
}
/**
* Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
* as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
*
* This method blocks until input data is available, end of file is detected, or an exception is thrown.
*
* @param buffer buffer into which data is read.
* @param offset the start offset in array b at which the data is written.
* @param length the maximum number of bytes to read.
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
* the stream has been reached.
*/
@Override
public int read(final byte[] buffer, int offset, int length) throws IOException {
final int originalLength = length;
while (length > 0) {
final int available = available();
if (available == 0) {
// Signal EOF to caller
if (originalLength == length) {
return -1;
}
break;
}
final int copyLength = Math.min(length, available);
System.arraycopy(mCurrentBlock.mBlock, mCurrentOffset, buffer, offset, copyLength);
mCurrentOffset += copyLength;
offset += copyLength;
length -= copyLength;
}
return originalLength - length;
}
/**
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
* not an actual byte offset.
*
* @param pos virtual file pointer position
* @throws IOException if stream is closed or not a file based stream
*/
public void seek(final long pos) throws IOException {
// Must be before the mFile == null check because mFile == null for closed files and streams
if (mIsClosed) {
throw new IOException(CANNOT_SEEK_CLOSED_STREAM_MSG);
}
// Cannot seek on streams that are not file based
if (mFile == null) {
throw new IOException(CANNOT_SEEK_STREAM_MSG);
}
// Decode virtual file pointer
// Upper 48 bits is the byte offset into the compressed stream of a
// block.
// Lower 16 bits is the byte offset into the uncompressed stream inside
// the block.
final long compressedOffset = BGZFFilePointerUtil.getBlockAddress(pos);
final int uncompressedOffset = BGZFFilePointerUtil.getBlockOffset(pos);
final int available;
if (mCurrentBlock != null && mCurrentBlock.mBlockAddress == compressedOffset) {
available = mCurrentBlock.mBlock.length;
} else {
prepareForSeek();
mFile.seek(compressedOffset);
mStreamOffset = compressedOffset;
mCurrentBlock = nextBlock(getBufferForReuse(mCurrentBlock));
mCurrentOffset = 0;
available = available();
}
if (uncompressedOffset > available || (uncompressedOffset == available && !eof())) {
throw new IOException(INVALID_FILE_PTR_MSG + pos + " for " + getSource());
}
mCurrentOffset = uncompressedOffset;
}
/**
* Performs cleanup required before seek is called on the underlying stream
*/
protected void prepareForSeek() {
}
private boolean eof() throws IOException {
if (mFile.eof()) {
return true;
}
// If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
return (mFile.length() - (mCurrentBlock.mBlockAddress
+ mCurrentBlock.mBlockCompressedSize) == BGZFStreamConstants.EMPTY_GZIP_BLOCK.length);
}
/**
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
* the two.
*/
public long getFilePointer() {
if (mCurrentBlock == null) {
// Haven't read anything yet = at start of stream
return BGZFFilePointerUtil.makeFilePointer(0, 0);
}
if (mCurrentOffset > 0 && mCurrentOffset == mCurrentBlock.mBlock.length) {
// If current offset is at the end of the current block, file
// pointer should point
// to the beginning of the next block.
return BGZFFilePointerUtil.makeFilePointer(mCurrentBlock.mBlockAddress + mCurrentBlock.mBlockCompressedSize, 0);
}
return BGZFFilePointerUtil.makeFilePointer(mCurrentBlock.mBlockAddress, mCurrentOffset);
}
public long getPosition() {
return getFilePointer();
}
public static long getFileBlock(final long bgzfOffset) {
return BGZFFilePointerUtil.getBlockAddress(bgzfOffset);
}
/**
* @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
* @return true if the given file looks like a valid BGZF file.
*/
public static boolean isValidFile(final InputStream stream) throws IOException {
if (!stream.markSupported()) {
throw new RuntimeException("Cannot test non-buffered stream");
}
stream.mark(BGZFStreamConstants.BLOCK_HEADER_LENGTH);
final byte[] buffer = new byte[BGZFStreamConstants.BLOCK_HEADER_LENGTH];
final int count = readBytes(stream, buffer, 0, BGZFStreamConstants.BLOCK_HEADER_LENGTH);
stream.reset();
return count == BGZFStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
}
private static boolean isValidBlockHeader(final byte[] buffer) {
return (buffer[0] == BGZFStreamConstants.GZIP_ID1 &&
(buffer[1] & 0xFF) == BGZFStreamConstants.GZIP_ID2 &&
(buffer[3] & BGZFStreamConstants.GZIP_FLG) != 0 &&
buffer[10] == BGZFStreamConstants.GZIP_XLEN &&
buffer[12] == BGZFStreamConstants.BGZF_ID1 &&
buffer[13] == BGZFStreamConstants.BGZF_ID2);
}
private void readBlock() throws IOException {
mCurrentBlock = nextBlock(getBufferForReuse(mCurrentBlock));
mCurrentOffset = 0;
checkAndRethrowDecompressionException();
}
/**
* Reads and decompresses the next block
* @param bufferAvailableForReuse decompression buffer available for reuse
* @return next block in the decompressed stream
*/
protected DecompressedBlock nextBlock(byte[] bufferAvailableForReuse) {
return processNextBlock(bufferAvailableForReuse);
}
/**
* Rethrows an exception encountered during decompression
* @throws IOException
*/
private void checkAndRethrowDecompressionException() throws IOException {
if (mCurrentBlock.mException != null) {
if (mCurrentBlock.mException instanceof IOException) {
throw (IOException) mCurrentBlock.mException;
} else if (mCurrentBlock.mException instanceof RuntimeException) {
throw (RuntimeException) mCurrentBlock.mException;
} else {
throw new RuntimeException(mCurrentBlock.mException);
}
}
}
/**
* Attempt to reuse the buffer of the given block
* @param block owning block
* @return decompression buffer to reuse, or null if no buffer is available
*/
private byte[] getBufferForReuse(DecompressedBlock block) {
if (block == null) return null;
return block.mBlock;
}
/**
* Decompress the next block from the input stream. When using asynchronous
* IO, this will be called by the background thread.
* @param bufferAvailableForReuse buffer in which to place decompressed block. A null or
* incorrectly sized buffer will result in the buffer being ignored and
* a new buffer allocated for decompression.
* @return next block in input stream
*/
protected DecompressedBlock processNextBlock(byte[] bufferAvailableForReuse) {
if (mFileBuffer == null) {
mFileBuffer = new byte[BGZFStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
}
long blockAddress = mStreamOffset;
try {
final int headerByteCount = readBytes(mFileBuffer, 0, BGZFStreamConstants.BLOCK_HEADER_LENGTH);
mStreamOffset += headerByteCount;
if (headerByteCount == 0) {
// Handle case where there is no empty gzip block at end.
return new DecompressedBlock(blockAddress, new byte[0], 0);
}
if (headerByteCount != BGZFStreamConstants.BLOCK_HEADER_LENGTH) {
return new DecompressedBlock(blockAddress, headerByteCount, new IOException(INCORRECT_HEADER_SIZE_MSG + getSource()));
}
final int blockLength = unpackInt16(mFileBuffer, BGZFStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
if (blockLength < BGZFStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
return new DecompressedBlock(blockAddress, blockLength,
new IOException(UNEXPECTED_BLOCK_LENGTH_MSG + blockLength + " for " + getSource()));
}
final int remaining = blockLength - BGZFStreamConstants.BLOCK_HEADER_LENGTH;
final int dataByteCount = readBytes(mFileBuffer, BGZFStreamConstants.BLOCK_HEADER_LENGTH,
remaining);
mStreamOffset += dataByteCount;
if (dataByteCount != remaining) {
return new DecompressedBlock(blockAddress, blockLength,
new BGZFException(PREMATURE_END_MSG + getSource()));
}
final byte[] decompressed = inflateBlock(mFileBuffer, blockLength, bufferAvailableForReuse);
return new DecompressedBlock(blockAddress, decompressed, blockLength);
} catch (IOException e) {
return new DecompressedBlock(blockAddress, 0, e);
}
}
private byte[] inflateBlock(final byte[] compressedBlock, final int compressedLength,
final byte[] bufferAvailableForReuse) throws IOException {
final int uncompressedLength = unpackInt32(compressedBlock, compressedLength - 4);
if (uncompressedLength < 0) {
throw new BGZFException(getSource() + " has invalid uncompressedLength: " + uncompressedLength);
}
byte[] buffer = bufferAvailableForReuse;
if (buffer == null || uncompressedLength != buffer.length) {
// can't reuse the buffer since the size is incorrect
buffer = new byte[uncompressedLength];
}
blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
return buffer;
}
private String getSource() {
return mFile == null ? "data stream" : mFile.getSource();
}
private int readBytes(final byte[] buffer, final int offset, final int length) throws IOException {
if (mFile != null) {
return readBytes(mFile, buffer, offset, length);
} else if (mStream != null) {
return readBytes(mStream, buffer, offset, length);
} else {
return 0;
}
}
private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length) throws IOException {
int bytesRead = 0;
while (bytesRead < length) {
final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
if (count <= 0) {
break;
}
bytesRead += count;
}
return bytesRead;
}
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) throws IOException {
int bytesRead = 0;
while (bytesRead < length) {
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
if (count <= 0) {
break;
}
bytesRead += count;
}
return bytesRead;
}
private int unpackInt16(final byte[] buffer, final int offset) {
return ((buffer[offset] & 0xFF) |
((buffer[offset+1] & 0xFF) << 8));
}
private int unpackInt32(final byte[] buffer, final int offset) {
return ((buffer[offset] & 0xFF) |
((buffer[offset+1] & 0xFF) << 8) |
((buffer[offset+2] & 0xFF) << 16) |
((buffer[offset+3] & 0xFF) << 24));
}
public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
/**
*
* @param file the file to check
* @return status of the last compressed block
* @throws IOException
*/
public static FileTermination checkTermination(final File file) throws IOException {
return checkTermination(file.toPath());
}
/**
*
* @param path to the file to check
* @return status of the last compressed block
* @throws IOException
*/
public static FileTermination checkTermination(final Path path) throws IOException {
try( final SeekableByteChannel channel = Files.newByteChannel(path, StandardOpenOption.READ) ){
return checkTermination(channel);
}
}
/**
* Check the status of the final bgzipped block for the given bgzipped resource.
*
* @param channel an open channel to read from; the channel will remain open and the initial
* position will be restored when the operation completes. No guarantee is made about the
* state of the channel if an exception is thrown during reading.
*
* @return the status of the last compressed block
* @throws IOException
*/
public static FileTermination checkTermination(SeekableByteChannel channel) throws IOException {
final long fileSize = channel.size();
if (fileSize < BGZFStreamConstants.EMPTY_GZIP_BLOCK.length) {
return FileTermination.DEFECTIVE;
}
final long initialPosition = channel.position();
boolean exceptionThrown = false;
try {
channel.position(fileSize - BGZFStreamConstants.EMPTY_GZIP_BLOCK.length);
//Check if the end of the file is an empty gzip block which is used as the terminator for a bgzipped file
final ByteBuffer lastBlockBuffer = ByteBuffer.allocate(BGZFStreamConstants.EMPTY_GZIP_BLOCK.length);
readFully(channel, lastBlockBuffer);
if (Arrays.equals(lastBlockBuffer.array(), BGZFStreamConstants.EMPTY_GZIP_BLOCK)) {
return FileTermination.HAS_TERMINATOR_BLOCK;
}
//if the last block isn't an empty gzip block, check to see if it is a healthy compressed block or if it's corrupted
final int bufsize = (int) Math.min(fileSize, BGZFStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
final byte[] bufferArray = new byte[bufsize];
channel.position(fileSize - bufsize);
readFully(channel, ByteBuffer.wrap(bufferArray));
for (int i = bufferArray.length - BGZFStreamConstants.EMPTY_GZIP_BLOCK.length;
i >= 0; --i) {
if (!preambleEqual(BGZFStreamConstants.GZIP_BLOCK_PREAMBLE,
bufferArray, i, BGZFStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
continue;
}
final ByteBuffer byteBuffer = ByteBuffer.wrap(bufferArray,
i + BGZFStreamConstants.GZIP_BLOCK_PREAMBLE.length,
4);
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
if (bufferArray.length - i == totalBlockSizeMinusOne + 1) {
return FileTermination.HAS_HEALTHY_LAST_BLOCK;
} else {
return FileTermination.DEFECTIVE;
}
}
return FileTermination.DEFECTIVE;
} catch (final Throwable e) {
exceptionThrown = true;
throw e;
} finally {
//if an exception was thrown we don't want to reset the position because that would be likely to throw again
//and suppress the initial exception
if(!exceptionThrown) {
channel.position(initialPosition);
}
}
}
/**
* Reads as many bytes as dst's capacity into dst, or throws if that is not possible.
*
* @throws EOFException if the channel has fewer bytes available than dst's capacity
*/
static void readFully(SeekableByteChannel channel, ByteBuffer dst) throws IOException {
int totalBytesRead = 0;
final int capacity = dst.capacity();
while (totalBytesRead < capacity) {
final int bytesRead = channel.read(dst);
if (bytesRead == -1) {
throw new EOFException();
}
totalBytesRead += bytesRead;
}
}
public static void assertNonDefectiveFile(final File file) throws IOException {
if (checkTermination(file) == FileTermination.DEFECTIVE) {
throw new BGZFException(file.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
}
}
private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
for (int i = 0; i < length; ++i) {
if (preamble[i] != buf[i + startOffset]) {
return false;
}
}
return true;
}
protected static class DecompressedBlock {
/**
* Decompressed block
*/
private final byte[] mBlock;
/**
* Compressed size of block (the uncompressed size can be found using
* mBlock.length)
*/
private final int mBlockCompressedSize;
/**
* Stream offset of start of block
*/
private final long mBlockAddress;
/**
* Exception thrown (if any) when attempting to decompress block
*/
private final Exception mException;
public DecompressedBlock(long blockAddress, byte[] block, int compressedSize) {
mBlock = block;
mBlockAddress = blockAddress;
mBlockCompressedSize = compressedSize;
mException = null;
}
public DecompressedBlock(long blockAddress, int compressedSize, Exception exception) {
mBlock = new byte[0];
mBlockAddress = blockAddress;
mBlockCompressedSize = compressedSize;
mException = exception;
}
}
}
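// Editor's note: a minimal usage sketch, not part of this commit. It assumes the example
// lives in the same package (org.xbib.io.compress.bgzf) and that a BGZF file named
// "example.bgzf" exists; both are hypothetical.
class BgzfTerminationCheckExample {
    public static void main(String[] args) throws java.io.IOException {
        java.io.File file = new java.io.File("example.bgzf");
        // checkTermination reports whether the file ends with the empty gzip block that
        // terminates a well-formed BGZF file, has a healthy last block, or is defective.
        BlockCompressedInputStream.FileTermination status =
                BlockCompressedInputStream.checkTermination(file);
        System.out.println("termination status: " + status);
        // assertNonDefectiveFile throws a BGZFException if no valid GZIP block ends the file.
        BlockCompressedInputStream.assertNonDefectiveFile(file);
    }
}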

View file

@ -0,0 +1,358 @@
package org.xbib.io.compress.bgzf;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.CRC32;
import java.util.zip.Deflater;
/**
* Stream class for a file that is a series of gzip blocks (BGZF format). The caller just treats it as an
* OutputStream, and under the covers a gzip block is written when the amount of uncompressed as-yet-unwritten
* bytes reaches a threshold.
*
* The advantage of BGZF over conventional gzip is that BGZF allows for seeking without having to scan through
* the entire file up to the position being sought.
*
* Note that the flush() method should not be called by clients unless they know what they
* are doing, because it forces a gzip block to be written even if the number of buffered
* bytes has not reached the threshold. close(), on the other hand, must be called when done
* writing in order to force the last gzip block to be written.
*
* @see <a href="http://samtools.sourceforge.net/SAM1.pdf">http://samtools.sourceforge.net/SAM1.pdf</a> for details of BGZF file format.
*/
public class BlockCompressedOutputStream extends OutputStream {
private static int defaultCompressionLevel = BGZFStreamConstants.DEFAULT_COMPRESSION_LEVEL;
private static DeflaterFactory defaultDeflaterFactory = new DeflaterFactory();
public static void setDefaultCompressionLevel(final int compressionLevel) {
if (compressionLevel < Deflater.NO_COMPRESSION || compressionLevel > Deflater.BEST_COMPRESSION) {
throw new IllegalArgumentException("Invalid compression level: " + compressionLevel);
}
defaultCompressionLevel = compressionLevel;
}
public static int getDefaultCompressionLevel() {
return defaultCompressionLevel;
}
/**
* Sets the default {@link DeflaterFactory} that will be used for all instances unless specified otherwise in the constructor.
* If this method is not called the default is a factory that will create the JDK {@link Deflater}.
* @param deflaterFactory non-null default factory.
*/
public static void setDefaultDeflaterFactory(final DeflaterFactory deflaterFactory) {
if (deflaterFactory == null) {
throw new IllegalArgumentException("null deflaterFactory");
}
defaultDeflaterFactory = deflaterFactory;
}
public static DeflaterFactory getDefaultDeflaterFactory() {
return defaultDeflaterFactory;
}
private final BinaryCodec codec;
private final byte[] uncompressedBuffer = new byte[BGZFStreamConstants.DEFAULT_UNCOMPRESSED_BLOCK_SIZE];
private int numUncompressedBytes = 0;
private final byte[] compressedBuffer =
new byte[BGZFStreamConstants.MAX_COMPRESSED_BLOCK_SIZE -
BGZFStreamConstants.BLOCK_HEADER_LENGTH];
private final Deflater deflater;
// A second deflater is created for the very unlikely case where the regular deflation actually makes
// things bigger, and the compressed block is too big. It should be possible to downshift the
// primary deflater to NO_COMPRESSION level, recompress, and then restore it to its original setting,
// but in practice that doesn't work.
// The motivation for deflating at NO_COMPRESSION level is that it will predictably produce compressed
// output that is 10 bytes larger than the input, and the threshold at which a block is generated is such that
// the size of the final gzip block will always be <= 64K. This is preferred over the previous method,
// which would attempt to compress up to 64K bytes, and if the resulting compressed block was too large,
// try compressing fewer input bytes (aka "downshifting"). The problem with downshifting is that
// getFilePointer might return an inaccurate value.
// I assume (AW 29-Oct-2013) that there is no value in using hardware-assisted deflater for no-compression mode,
// so just use JDK standard.
private final Deflater noCompressionDeflater = new Deflater(Deflater.NO_COMPRESSION, true);
private final CRC32 crc32 = new CRC32();
private Path file = null;
private long mBlockAddress = 0;
/**
* Uses the default compression level, which is 5 unless changed by setDefaultCompressionLevel.
* Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}.
* Use {@link #BlockCompressedOutputStream(File, int, DeflaterFactory)} to specify a custom factory.
*/
public BlockCompressedOutputStream(final String filename) throws FileNotFoundException {
this(filename, defaultCompressionLevel);
}
/**
* Uses the default compression level, which is 5 unless changed by setDefaultCompressionLevel.
* Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}.
* Use {@link #BlockCompressedOutputStream(File, int, DeflaterFactory)} to specify a custom factory.
*/
public BlockCompressedOutputStream(final File file) throws FileNotFoundException {
this(file, defaultCompressionLevel);
}
public BlockCompressedOutputStream(final String filename, final int compressionLevel) throws FileNotFoundException {
this(new File(filename), compressionLevel);
}
public BlockCompressedOutputStream(final File file, final int compressionLevel) throws FileNotFoundException {
this(file, compressionLevel, defaultDeflaterFactory);
}
public BlockCompressedOutputStream(final File file, final int compressionLevel, final DeflaterFactory deflaterFactory) throws FileNotFoundException {
this.file = file.toPath();
codec = new BinaryCodec(file, true);
deflater = deflaterFactory.makeDeflater(compressionLevel, true);
}
/**
* Uses the default compression level, which is 5 unless changed by setDefaultCompressionLevel.
* Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}.
* Use {@link #BlockCompressedOutputStream(OutputStream, File, int, DeflaterFactory)} to specify a custom factory.
*/
public BlockCompressedOutputStream(final OutputStream os) {
this(os, (File)null, defaultCompressionLevel);
}
/**
* Uses the default compression level, which is 5 unless changed by setDefaultCompressionLevel.
* Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}.
* Use {@link #BlockCompressedOutputStream(OutputStream, File, int, DeflaterFactory)} to specify a custom factory.
*
* @param file may be null
*/
public BlockCompressedOutputStream(final OutputStream os, final Path file) {
this(os, file, defaultCompressionLevel);
}
/**
* Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}.
* Use {@link #BlockCompressedOutputStream(OutputStream, File, int, DeflaterFactory)} to specify a custom factory.
*/
public BlockCompressedOutputStream(final OutputStream os, final File file, final int compressionLevel) {
this(os, file, compressionLevel, defaultDeflaterFactory);
}
/**
* Note: this constructor uses the default {@link DeflaterFactory}, see {@link #getDefaultDeflaterFactory()}.
* Use {@link #BlockCompressedOutputStream(OutputStream, File, int, DeflaterFactory)} to specify a custom factory.
*/
public BlockCompressedOutputStream(final OutputStream os, final Path file, final int compressionLevel) {
this(os, file, compressionLevel, defaultDeflaterFactory);
}
/**
* Creates the output stream.
* @param os output stream to create a BlockCompressedOutputStream from
* @param file file to which to write the output or null if not available
* @param compressionLevel the compression level (0-9)
* @param deflaterFactory custom factory to create deflaters (overrides the default)
*/
public BlockCompressedOutputStream(final OutputStream os, final File file, final int compressionLevel, final DeflaterFactory deflaterFactory) {
this(os, file != null ? file.toPath() : null, compressionLevel, deflaterFactory);
}
/**
* Creates the output stream.
* @param os output stream to create a BlockCompressedOutputStream from
* @param file file to which to write the output or null if not available
* @param compressionLevel the compression level (0-9)
* @param deflaterFactory custom factory to create deflaters (overrides the default)
*/
public BlockCompressedOutputStream(final OutputStream os, final Path file, final int compressionLevel, final DeflaterFactory deflaterFactory) {
this.file = file;
codec = new BinaryCodec(os);
if (file != null) {
codec.setOutputFileName(file.toAbsolutePath().toUri().toString());
}
deflater = deflaterFactory.makeDeflater(compressionLevel, true);
}
/**
* @param output May or may not already be a BlockCompressedOutputStream.
* @return A BlockCompressedOutputStream, either by wrapping the given OutputStream, or by casting if it already
* is a BCOS.
*/
public static BlockCompressedOutputStream maybeBgzfWrapOutputStream(OutputStream output) {
if (!(output instanceof BlockCompressedOutputStream)) {
return new BlockCompressedOutputStream(output);
} else {
return (BlockCompressedOutputStream)output;
}
}
/**
* Writes b.length bytes from the specified byte array to this output stream. The general contract for write(b)
* is that it should have exactly the same effect as the call write(b, 0, b.length).
* @param bytes the data
*/
@Override
public void write(final byte[] bytes) throws IOException {
write(bytes, 0, bytes.length);
}
/**
* Writes len bytes from the specified byte array starting at offset off to this output stream. The general
* contract for write(b, off, len) is that some of the bytes in the array b are written to the output stream in order;
* element b[off] is the first byte written and b[off+len-1] is the last byte written by this operation.
*
* @param bytes the data
* @param startIndex the start offset in the data
* @param numBytes the number of bytes to write
*/
@Override
public void write(final byte[] bytes, int startIndex, int numBytes) throws IOException {
while (numBytes > 0) {
final int bytesToWrite = Math.min(uncompressedBuffer.length - numUncompressedBytes, numBytes);
System.arraycopy(bytes, startIndex, uncompressedBuffer, numUncompressedBytes, bytesToWrite);
numUncompressedBytes += bytesToWrite;
startIndex += bytesToWrite;
numBytes -= bytesToWrite;
if (numUncompressedBytes == uncompressedBuffer.length) {
deflateBlock();
}
}
}
@Override
public void write(final int b) throws IOException {
uncompressedBuffer[numUncompressedBytes++] = (byte) b;
if (numUncompressedBytes == uncompressedBuffer.length) deflateBlock();
}
/**
* WARNING: flush() affects the output format, because it causes the current contents of uncompressedBuffer
* to be compressed and written, even if it isn't full. Unless you know what you're doing, don't call flush().
* Instead, call close(), which will flush any unwritten data before closing the underlying stream.
*
*/
@Override
public void flush() throws IOException {
while (numUncompressedBytes > 0) {
deflateBlock();
}
codec.getOutputStream().flush();
}
/**
* close() must be called in order to flush any remaining buffered bytes. An unclosed file will likely be
* defective.
*
*/
@Override
public void close() throws IOException {
close(true);
}
public void close(final boolean writeTerminatorBlock) throws IOException {
flush();
// For debugging...
// if (numberOfThrottleBacks > 0) {
// System.err.println("In BlockCompressedOutputStream, had to throttle back " + numberOfThrottleBacks +
// " times for file " + codec.getOutputFileName());
// }
if (writeTerminatorBlock) {
codec.writeBytes(BGZFStreamConstants.EMPTY_GZIP_BLOCK);
}
codec.close();
// If a terminator block was written, ensure that it's there and valid
if (writeTerminatorBlock) {
// Can't re-open something that is not a regular file, e.g. a named pipe or an output stream
if (this.file == null || !Files.isRegularFile(this.file)) return;
if (BlockCompressedInputStream.checkTermination(this.file) !=
BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) {
throw new IOException("Terminator block not found after closing BGZF file " + this.file);
}
}
}
/** Encodes a virtual file pointer.
* The upper 48 bits are the byte offset of the start of the block in the compressed stream.
* The lower 16 bits are the byte offset into the uncompressed data inside the block.
*/
public long getFilePointer(){
return BGZFFilePointerUtil.makeFilePointer(mBlockAddress, numUncompressedBytes);
}
public long getPosition() {
return getFilePointer();
}
/**
* Attempt to write the data in uncompressedBuffer to the underlying file in a gzip block.
* If the entire uncompressedBuffer does not fit in the maximum allowed size, reduce the amount
* of data to be compressed, and slide the excess down in uncompressedBuffer so it can be picked
* up in the next deflate event.
* @return size of gzip block that was written.
*/
private int deflateBlock() throws IOException {
if (numUncompressedBytes == 0) {
return 0;
}
final int bytesToCompress = numUncompressedBytes;
// Compress the input
deflater.reset();
deflater.setInput(uncompressedBuffer, 0, bytesToCompress);
deflater.finish();
int compressedSize = deflater.deflate(compressedBuffer, 0, compressedBuffer.length);
// If it didn't all fit in compressedBuffer.length, set compression level to NO_COMPRESSION
// and try again. This should always fit.
if (!deflater.finished()) {
noCompressionDeflater.reset();
noCompressionDeflater.setInput(uncompressedBuffer, 0, bytesToCompress);
noCompressionDeflater.finish();
compressedSize = noCompressionDeflater.deflate(compressedBuffer, 0, compressedBuffer.length);
if (!noCompressionDeflater.finished()) {
throw new IllegalStateException("NO_COMPRESSION deflate did not fit into the compressed buffer");
}
}
// Data compressed small enough, so write it out.
crc32.reset();
crc32.update(uncompressedBuffer, 0, bytesToCompress);
final int totalBlockSize = writeGzipBlock(compressedSize, bytesToCompress, crc32.getValue());
// Clear out from uncompressedBuffer the data that was written
numUncompressedBytes = 0;
mBlockAddress += totalBlockSize;
return totalBlockSize;
}
/**
* Writes the entire gzip block, assuming the compressed data is stored in compressedBuffer
* @return size of gzip block that was written.
*/
private int writeGzipBlock(final int compressedSize, final int uncompressedSize, final long crc) throws IOException {
// Init gzip header
codec.writeByte(BGZFStreamConstants.GZIP_ID1);
codec.writeByte(BGZFStreamConstants.GZIP_ID2);
codec.writeByte(BGZFStreamConstants.GZIP_CM_DEFLATE);
codec.writeByte(BGZFStreamConstants.GZIP_FLG);
codec.writeInt(0); // Modification time
codec.writeByte(BGZFStreamConstants.GZIP_XFL);
codec.writeByte(BGZFStreamConstants.GZIP_OS_UNKNOWN);
codec.writeShort(BGZFStreamConstants.GZIP_XLEN);
codec.writeByte(BGZFStreamConstants.BGZF_ID1);
codec.writeByte(BGZFStreamConstants.BGZF_ID2);
codec.writeShort(BGZFStreamConstants.BGZF_LEN);
final int totalBlockSize = compressedSize + BGZFStreamConstants.BLOCK_HEADER_LENGTH +
BGZFStreamConstants.BLOCK_FOOTER_LENGTH;
// The BGZF spec stores BSIZE as the total block size minus 1.
codec.writeShort((short)(totalBlockSize - 1));
codec.writeBytes(compressedBuffer, 0, compressedSize);
codec.writeInt((int)crc);
codec.writeInt(uncompressedSize);
return totalBlockSize;
}
}
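// Editor's note: a minimal usage sketch, not part of this commit. It assumes the same
// package and a hypothetical output path; the 48/16-bit split mirrors the virtual file
// pointer layout documented at getFilePointer() above.
class BgzfWriteExample {
    public static void main(String[] args) throws java.io.IOException {
        java.io.File out = new java.io.File("example.bgzf");
        try (BlockCompressedOutputStream bcos = new BlockCompressedOutputStream(out)) {
            bcos.write("hello bgzf".getBytes(java.nio.charset.StandardCharsets.UTF_8));
            long vfp = bcos.getFilePointer();
            long blockAddress = vfp >>> 16;          // compressed offset of the current block
            int blockOffset = (int) (vfp & 0xFFFF);  // uncompressed offset within the block
            System.out.println("virtual file pointer " + blockAddress + ":" + blockOffset);
        }
        // close() appends the empty terminator block, so the file should now verify cleanly.
        BlockCompressedInputStream.assertNonDefectiveFile(out);
    }
}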

View file

@ -0,0 +1,114 @@
package org.xbib.io.compress.bgzf;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.zip.CRC32;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
/**
* For decompressing GZIP blocks that are already loaded into a byte[].
* The main advantage is that this object can be used over and over again to decompress many blocks,
* whereas a new GZIPInputStream and ByteArrayInputStream would otherwise need to be created for each
* block to be decompressed.
*
* This code requires that the GZIP header conform to the GZIP blocks written to BGZF files, with
* the BGZF extra subfield and no other optional fields.
*/
public class BlockGunzipper {
private static InflaterFactory defaultInflaterFactory = new InflaterFactory();
private final Inflater inflater;
private final CRC32 crc32 = new CRC32();
private boolean checkCrcs = false;
BlockGunzipper() {
inflater = defaultInflaterFactory.makeInflater(true); // GZIP mode
}
/**
* Creates a BlockGunzipper using the provided inflaterFactory.
* @param inflaterFactory the factory used to create the {@link Inflater}
*/
BlockGunzipper(InflaterFactory inflaterFactory) {
inflater = inflaterFactory.makeInflater(true); // GZIP mode
}
/**
* Sets the default {@link InflaterFactory} that will be used for all instances unless specified otherwise in the constructor.
* If this method is not called the default is a factory that will create the JDK {@link Inflater}.
* @param inflaterFactory non-null default factory.
*/
public static void setDefaultInflaterFactory(final InflaterFactory inflaterFactory) {
if (inflaterFactory == null) {
throw new IllegalArgumentException("null inflaterFactory");
}
defaultInflaterFactory = inflaterFactory;
}
public static InflaterFactory getDefaultInflaterFactory() {
return defaultInflaterFactory;
}
/** Allows the caller to decide whether or not to check CRCs when uncompressing blocks. */
public void setCheckCrcs(final boolean check) {
this.checkCrcs = check;
}
/**
* Decompress GZIP-compressed data
* @param uncompressedBlock must be big enough to hold decompressed output.
* @param compressedBlock compressed data starting at offset 0
* @param compressedLength size of compressed data, possibly less than the size of the buffer.
*/
void unzipBlock(byte[] uncompressedBlock, byte[] compressedBlock, int compressedLength) {
try {
ByteBuffer byteBuffer = ByteBuffer.wrap(compressedBlock, 0, compressedLength);
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
// Validate GZIP header
if (byteBuffer.get() != BGZFStreamConstants.GZIP_ID1 ||
byteBuffer.get() != (byte)BGZFStreamConstants.GZIP_ID2 ||
byteBuffer.get() != BGZFStreamConstants.GZIP_CM_DEFLATE ||
byteBuffer.get() != BGZFStreamConstants.GZIP_FLG
) {
throw new BGZFFormatException("Invalid GZIP header");
}
// Skip MTIME, XFL, OS fields
byteBuffer.position(byteBuffer.position() + 6);
if (byteBuffer.getShort() != BGZFStreamConstants.GZIP_XLEN) {
throw new BGZFFormatException("Invalid GZIP header");
}
// Skip blocksize subfield intro
byteBuffer.position(byteBuffer.position() + 4);
// Read ushort
final int totalBlockSize = (byteBuffer.getShort() & 0xffff) + 1;
if (totalBlockSize != compressedLength) {
throw new BGZFFormatException("GZIP blocksize disagreement");
}
// Read expected size and CRC from end of GZIP block
final int deflatedSize = compressedLength - BGZFStreamConstants.BLOCK_HEADER_LENGTH - BGZFStreamConstants.BLOCK_FOOTER_LENGTH;
byteBuffer.position(byteBuffer.position() + deflatedSize);
int expectedCrc = byteBuffer.getInt();
int uncompressedSize = byteBuffer.getInt();
inflater.reset();
// Decompress
inflater.setInput(compressedBlock, BGZFStreamConstants.BLOCK_HEADER_LENGTH, deflatedSize);
final int inflatedBytes = inflater.inflate(uncompressedBlock, 0, uncompressedSize);
if (inflatedBytes != uncompressedSize) {
throw new BGZFFormatException("Did not inflate expected amount");
}
// Validate CRC if so desired
if (this.checkCrcs) {
crc32.reset();
crc32.update(uncompressedBlock, 0, uncompressedSize);
final long crc = crc32.getValue();
if ((int)crc != expectedCrc) {
throw new BGZFFormatException("CRC mismatch");
}
}
} catch (DataFormatException e) {
throw new BGZFException(e);
}
}
}

View file

@ -0,0 +1,24 @@
package org.xbib.io.compress.bgzf;
import java.util.zip.Deflater;
/**
* Factory for {@link Deflater} objects used by {@link BlockCompressedOutputStream}.
* This class may be extended to provide alternative deflaters (e.g., for improved performance).
*/
public class DeflaterFactory {
public DeflaterFactory() {
//Note: made explicit constructor to make searching for references easier
}
/**
* Returns a deflater object that will be used when writing BGZF files.
* Subclasses may override to provide their own deflater implementation.
* @param compressionLevel the compression level (0-9)
* @param gzipCompatible if true then use GZIP compatible compression
*/
public Deflater makeDeflater(final int compressionLevel, final boolean gzipCompatible) {
return new Deflater(compressionLevel, gzipCompatible);
}
}
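// Editor's note: a hypothetical subclass, not part of this commit, illustrating the
// extension point described above: a factory that ignores the requested level and
// always deflates at BEST_SPEED.
class BestSpeedDeflaterFactory extends DeflaterFactory {
    @Override
    public java.util.zip.Deflater makeDeflater(final int compressionLevel, final boolean gzipCompatible) {
        // The requested compressionLevel is deliberately ignored here.
        return new java.util.zip.Deflater(java.util.zip.Deflater.BEST_SPEED, gzipCompatible);
    }
}
// It could then be installed for all writers via
// BlockCompressedOutputStream.setDefaultDeflaterFactory(new BestSpeedDeflaterFactory());
// or passed to one of the constructors that accept a DeflaterFactory.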

View file

@ -0,0 +1,20 @@
package org.xbib.io.compress.bgzf;
import java.util.zip.Inflater;
/**
* Factory for {@link Inflater} objects used by {@link BlockGunzipper}.
* This class may be extended to provide alternative inflaters (e.g., for improved performance).
* The default implementation returns a JDK {@link Inflater}
*/
public class InflaterFactory {
/**
* Returns an inflater object that will be used when reading DEFLATE compressed files.
* Subclasses may override to provide their own inflater implementation.
* The default implementation returns a JDK {@link Inflater}
* @param gzipCompatible if true then use GZIP compatible decompression
*/
public Inflater makeInflater(final boolean gzipCompatible) {
return new Inflater(gzipCompatible);
}
}
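// Editor's note: a hypothetical subclass, not part of this commit, showing how a custom
// InflaterFactory could be wired in as the default used by BlockGunzipper. It assumes the
// same package as the classes above.
class CountingInflaterFactory extends InflaterFactory {
    private final java.util.concurrent.atomic.AtomicLong created = new java.util.concurrent.atomic.AtomicLong();
    @Override
    public java.util.zip.Inflater makeInflater(final boolean gzipCompatible) {
        created.incrementAndGet();  // track how many inflaters have been handed out
        return new java.util.zip.Inflater(gzipCompatible);
    }
    public long createdCount() {
        return created.get();
    }
}
// Usage: BlockGunzipper.setDefaultInflaterFactory(new CountingInflaterFactory());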

View file

@ -0,0 +1,105 @@
package org.xbib.io.compress.bgzf;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
* A wrapper class that provides buffered read access to a SeekableStream. Simply wrapping such a stream
* in a BufferedInputStream will not work, as BufferedInputStream does not support seeking. In this
* implementation a seek call is delegated to the wrapped stream, and the buffer is reset.
*/
public class SeekableBufferedStream extends SeekableStream {
/** Little extension to buffered input stream to give access to the available bytes in the buffer. */
private static class ExtBufferedInputStream extends BufferedInputStream {
private ExtBufferedInputStream(final InputStream inputStream, final int i) {
super(inputStream, i);
}
/** Returns the number of bytes that can be read from the buffer without reading more into the buffer. */
int getBytesInBufferAvailable() {
if (this.count == this.pos) return 0; // documented test for "is buffer empty"
else return this.buf.length - this.pos;
}
}
public static final int DEFAULT_BUFFER_SIZE = 512000;
final private int bufferSize;
final SeekableStream wrappedStream;
ExtBufferedInputStream bufferedStream;
long position;
public SeekableBufferedStream(final SeekableStream stream, final int bufferSize) {
this.bufferSize = bufferSize;
this.wrappedStream = stream;
this.position = 0;
bufferedStream = new ExtBufferedInputStream(wrappedStream, bufferSize);
}
public SeekableBufferedStream(final SeekableStream stream) {
this(stream, DEFAULT_BUFFER_SIZE);
}
@Override
public long length() {
return wrappedStream.length();
}
@Override
public long skip(final long skipLength) throws IOException {
if (skipLength < this.bufferedStream.getBytesInBufferAvailable()) {
final long retval = this.bufferedStream.skip(skipLength);
this.position += retval;
return retval;
} else {
final long position = this.position + skipLength;
seek(position);
return skipLength;
}
}
@Override
public void seek(final long position) throws IOException {
this.position = position;
wrappedStream.seek(position);
bufferedStream = new ExtBufferedInputStream(wrappedStream, bufferSize);
}
@Override
public int read() throws IOException {
final int b = bufferedStream.read();
// Only advance the position if a byte was actually read (read() returns -1 at EOF).
if (b >= 0) {
position++;
}
return b;
}
@Override
public int read(final byte[] buffer, final int offset, final int length) throws IOException {
final int nBytesRead = bufferedStream.read(buffer, offset, length);
if (nBytesRead > 0) {
position += nBytesRead;
}
return nBytesRead;
}
@Override
public void close() throws IOException {
wrappedStream.close();
}
@Override
public boolean eof() throws IOException {
return position >= wrappedStream.length();
}
@Override
public String getSource() {
return wrappedStream.getSource();
}
@Override
public long position() throws IOException {
return position;
}
}
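// Editor's note: a minimal usage sketch, not part of this commit, combining
// SeekableBufferedStream with SeekableFileStream (defined in the next file).
// The file name is hypothetical.
class SeekableBufferedStreamExample {
    public static void main(String[] args) throws java.io.IOException {
        try (SeekableStream in = new SeekableBufferedStream(
                new SeekableFileStream(new java.io.File("example.bgzf")))) {
            long tailStart = in.length() - 28;   // the 28-byte BGZF terminator block
            in.seek(tailStart);                  // delegated to the wrapped stream, buffer reset
            byte[] tail = new byte[28];
            in.readFully(tail);                  // inherited from SeekableStream
            System.out.println("read " + tail.length + " bytes at offset " + tailStart);
        }
    }
}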

View file

@ -0,0 +1,110 @@
package org.xbib.io.compress.bgzf;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
public class SeekableFileStream extends SeekableStream {
/**
* Collection of all open instances. SeekableFileStream objects are usually open and kept open for the
* duration of a session. This collection supports a method to close them all.
*/
private static final Collection<SeekableFileStream> allInstances = Collections.synchronizedCollection(new HashSet<>());
File file;
RandomAccessFile fis;
public SeekableFileStream(final File file) throws FileNotFoundException {
this.file = file;
fis = new RandomAccessFile(file, "r");
allInstances.add(this);
}
@Override
public long length() {
return file.length();
}
@Override
public boolean eof() throws IOException {
return fis.length() == fis.getFilePointer();
}
@Override
public void seek(final long position) throws IOException {
fis.seek(position);
}
@Override
public long position() throws IOException {
return fis.getChannel().position();
}
@Override
public long skip(long n) throws IOException {
long initPos = position();
fis.getChannel().position(initPos + n);
return position() - initPos;
}
@Override
public int read(final byte[] buffer, final int offset, final int length) throws IOException {
if (length < 0) {
throw new IndexOutOfBoundsException();
}
int n = 0;
while (n < length) {
final int count = fis.read(buffer, offset + n, length - n);
if (count < 0) {
if (n > 0) {
return n;
} else {
return count;
}
}
n += count;
}
return n;
}
@Override
public int read() throws IOException {
return fis.read();
}
@Override
public int read(byte[] b) throws IOException {
return fis.read(b);
}
@Override
public String getSource() {
return file.getAbsolutePath();
}
@Override
public void close() throws IOException {
allInstances.remove(this);
fis.close();
}
public static synchronized void closeAllInstances() {
Collection<SeekableFileStream> clonedInstances = new HashSet<>(allInstances);
for (SeekableFileStream sfs : clonedInstances) {
try {
sfs.close();
} catch (IOException e) {
//
}
}
allInstances.clear();
}
}

View file

@ -0,0 +1,44 @@
package org.xbib.io.compress.bgzf;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
public abstract class SeekableStream extends InputStream {
public abstract long length();
public abstract long position() throws IOException;
public abstract void seek(long position) throws IOException;
@Override
public abstract int read(byte[] buffer, int offset, int length) throws IOException;
@Override
public abstract void close() throws IOException;
public abstract boolean eof() throws IOException;
/**
* @return String representation of source (e.g. URL, file path, etc.), or null if not available.
*/
public abstract String getSource();
/**
* Read enough bytes to fill the input buffer.
* @param b byte array
* @throws EOFException If EOF is reached before buffer is filled
*/
public void readFully(byte[] b) throws IOException {
int len = b.length;
int n = 0;
while (n < len) {
int count = read(b, n, len - n);
if (count < 0){
throw new EOFException();
}
n += count;
}
}
}

View file

@ -0,0 +1,11 @@
package org.xbib.io.compress.bgzf;
import java.io.File;
import java.io.IOException;
public class SeekableStreamFactory {
public static SeekableStream getStreamFor(String path) throws IOException {
return new SeekableFileStream(new File(path));
}
}

View file

@ -1,3 +1,4 @@
include 'io-compress-bgzf'
include 'io-compress-bzip2'
include 'io-compress-lzf'
include 'io-compress-xz'