Intro
(See version 1.0.0.)
(See the GitHub repository.)
This time I have got rid of some generic classes and adapted the entire code base to deal with the byte alphabet.
Code
package io.github.coderodde.compressor.app;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * This class implements a Huffman compressor for binary (byte-wise) data.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 14, 2025)
 * @since 1.0.0 (Nov 14, 2025)
 */
public final class App {

    /**
     * The exit status for abnormal termination.
     */
    private static final int EXIT_FAILURE = 1;

    /**
     * This extension is added to the compressed files.
     */
    private static final String COMPRESSED_FILE_EXTENSION = ".huf";

    /**
     * The program entry point.
     *
     * @param args with one argument, compresses the named file; with two
     *             arguments, decompresses the first file into the second;
     *             otherwise prints the usage message.
     */
    public static void main(String[] args) {
        try {
            if (args.length == 1) {
                compressFile(args[0]);
            } else if (args.length == 2) {
                decompressFile(args[0], args[1]);
            } else {
                printUsage();
            }
        } catch (final Exception ex) {
            error(ex.getMessage());
            System.exit(EXIT_FAILURE);
        }
    }

    /**
     * Prints the usage message to the standard output.
     */
    private static void printUsage() {
        final String jarName = getJarFileName();
        // Pass the JAR name as a printf argument instead of pre-formatting
        // the message: feeding an already formatted string back into printf
        // would misinterpret any '%' characters in the file name.
        System.out.printf(
                "Usage: %s FILE - to compress FILE into FILE.huf\n",
                jarName);
        System.out.printf(
                "       %s FILE.huf OUTPUT_FILE - " +
                "to decompress FILE.huf into OUTPUT_FILE\n",
                jarName);
    }

    /**
     * Compresses the file named {@code inputFileName} into a sibling file
     * with the {@code .huf} extension appended.
     *
     * @param inputFileName the name of the file to compress.
     * @throws IOException if file I/O fails.
     */
    private static void compressFile(final String inputFileName)
            throws IOException {
        final File inputFile = new File(inputFileName);
        if (!inputFile.exists()) {
            error(String.format("The input file '%s' does not exist.\n",
                                inputFileName));
            System.exit(EXIT_FAILURE);
        }
        if (inputFileName.endsWith(COMPRESSED_FILE_EXTENSION)) {
            error(String.format(
                    "Input file '%s' already seems to be compressed.\n",
                    inputFileName));
            System.exit(EXIT_FAILURE);
        }
        final Path path = inputFile.toPath();
        long ta = System.currentTimeMillis();
        final byte[] rawData = Files.readAllBytes(path);
        long tb = System.currentTimeMillis();
        info(String.format(
                "Read all file bytes in %d milliseconds.\n", tb - ta));
        ta = System.currentTimeMillis();
        final byte[] compressedData = HuffmanByteCompressor.compress(rawData);
        tb = System.currentTimeMillis();
        info(String.format(
                "Compressed the data in %d milliseconds.\n", tb - ta));
        final File outputFile =
                new File(inputFileName + COMPRESSED_FILE_EXTENSION);
        ta = System.currentTimeMillis();
        Files.write(outputFile.toPath(), compressedData);
        tb = System.currentTimeMillis();
        info(String.format(
                "Written the compressed data in %d milliseconds.\n", tb - ta));
    }

    /**
     * Decompresses the file named {@code compressedFileName} into the file
     * named {@code outputFileName}.
     *
     * @param compressedFileName the name of the compressed input file.
     * @param outputFileName     the name of the decompressed output file.
     * @throws IOException if file I/O fails.
     */
    private static void decompressFile(final String compressedFileName,
                                       final String outputFileName)
            throws IOException {
        final File compressedFile = new File(compressedFileName);
        final File outputFile = new File(outputFileName);
        if (!compressedFile.exists()) {
            error(String.format("Compressed file '%s' does not exist.\n",
                                compressedFileName));
            System.exit(EXIT_FAILURE);
        }
        // No need to pre-create the output file: Files.write creates it by
        // default (CREATE + TRUNCATE_EXISTING). The previous explicit
        // createNewFile() call also silently ignored its return value.
        final Path compressedFilePath = compressedFile.toPath();
        long ta = System.currentTimeMillis();
        final byte[] compressedData = Files.readAllBytes(compressedFilePath);
        long tb = System.currentTimeMillis();
        info(String.format(
                "Read the compressed data in %d milliseconds.\n", tb - ta));
        final Path outputFilePath = outputFile.toPath();
        ta = System.currentTimeMillis();
        final byte[] originalData =
                HuffmanByteDecompressor.decompress(compressedData);
        tb = System.currentTimeMillis();
        info(String.format(
                "Decompressed the data in %d milliseconds.\n", tb - ta));
        ta = System.currentTimeMillis();
        Files.write(outputFilePath, originalData);
        tb = System.currentTimeMillis();
        info(String.format(
                "Written the decompressed data in %d milliseconds.\n",
                tb - ta));
    }

    /**
     * Prints an error message to the standard error stream. The message is
     * passed through "%s", so '%' characters in it are printed verbatim.
     *
     * @param message the message to print.
     */
    private static void error(final String message) {
        System.err.printf("[ERROR] %s", message);
    }

    /**
     * Prints an informational message to the standard output stream.
     *
     * @param message the message to print.
     */
    private static void info(final String message) {
        System.out.printf("[INFO] %s", message);
    }

    /**
     * Obtains the current name of this JAR-file.
     *
     * NOTE(review): the code-source path is not URL-decoded here, so a path
     * containing e.g. spaces may render as '%20' — confirm acceptable.
     *
     * @return the name of this JAR-file.
     */
    private static String getJarFileName() {
        return new File(App.class.getProtectionDomain()
                                 .getCodeSource()
                                 .getLocation()
                                 .getPath())
                .getName();
    }
}
package io.github.coderodde.compressor.app;
import java.util.Objects;
/**
 * This class is responsible for decompressing the actual compressed data to
 * the compression source data.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 17, 2025)
 * @since 1.0.0 (Nov 17, 2025)
 */
public final class ByteArrayCompressedDataReader {

    /**
     * The resultant decompressed data.
     */
    private final byte[] outputRawData;

    /**
     * The input compressed data.
     */
    private final byte[] inputCompressedData;

    /**
     * The index of the bit where compressed data begins.
     */
    private final long startingBitIndex;

    /**
     * The (Huffman) decoder tree.
     */
    private final ByteHuffmanDecoderTree<Byte> decoderTree;

    /**
     * Constructs this compressed data reader/decompressor.
     *
     * @param outputRawData       the resultant decompressed data.
     * @param inputCompressedData the input compressed data.
     * @param startingBitIndex    the index of the first bit to decompress
     *                            right after the header.
     * @param decoderTree         the decoder tree.
     */
    public ByteArrayCompressedDataReader(final byte[] outputRawData,
                                         final byte[] inputCompressedData,
                                         final long startingBitIndex,
                                         final ByteHuffmanDecoderTree<Byte>
                                                 decoderTree) {
        this.outputRawData =
                Objects.requireNonNull(outputRawData,
                                       "The output raw data is null");
        this.inputCompressedData =
                Objects.requireNonNull(inputCompressedData,
                                       "The input compressed data is null");
        this.decoderTree =
                Objects.requireNonNull(decoderTree,
                                       "The input decoder tree is null");
        this.startingBitIndex = startingBitIndex;
    }

    /**
     * Decompresses and reads the compressed data: decodes exactly
     * {@code outputRawData.length} symbols, advancing the bit cursor by the
     * length of each decoded codeword.
     */
    public void read() {
        long bitCursor = startingBitIndex;
        int byteCursor = 0;
        while (byteCursor < outputRawData.length) {
            outputRawData[byteCursor++] =
                    decoderTree.decode(inputCompressedData, bitCursor);
            bitCursor += decoderTree.getPreviousCodeLength();
        }
    }
}
package io.github.coderodde.compressor.app;
import java.util.Objects;
/**
 * This class is responsible for writing the actual compressed data to a byte
 * array. This class does not handle the compression header; it is handled by
 * {@link io.github.coderodde.compressor.app.ByteArrayHeaderWriter}.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 17, 2025)
 * @since 1.0.0 (Nov 17, 2025)
 */
public final class ByteArrayCompressedDataWriter {

    /**
     * The target byte array to which all the compressed data will end up.
     */
    private final byte[] compressedOutputData;

    /**
     * The actual data to be compressed.
     */
    private final byte[] inputRawData;

    /**
     * The index of the first bit in the compressed data. We need this in
     * order to omit the compression header.
     */
    private final long startingBitIndex;

    /**
     * The actual (Huffman) code table.
     */
    private final ByteHuffmanCodeTable codeTable;

    /**
     * Constructs this writer.
     *
     * @param compressedOutputData the compressed data byte array.
     * @param inputRawData         the input raw data byte array.
     * @param startingBitIndex     the starting bit index for the writing.
     * @param codeTable            the byte encoding table.
     */
    public ByteArrayCompressedDataWriter(
            final byte[] compressedOutputData,
            final byte[] inputRawData,
            final long startingBitIndex,
            final ByteHuffmanCodeTable codeTable) {
        this.compressedOutputData =
                Objects.requireNonNull(compressedOutputData,
                                       "The output compressed data is null");
        this.inputRawData =
                Objects.requireNonNull(inputRawData,
                                       "The input raw data is null");
        this.codeTable =
                Objects.requireNonNull(codeTable,
                                       "The input code table is null");
        this.startingBitIndex = startingBitIndex;
    }

    /**
     * Writes the entire compressed data of {@code inputRawData}: each input
     * byte is looked up in the code table and its (reversed) codeword is
     * appended at the running bit cursor.
     */
    public void write() {
        long bitCursor = startingBitIndex;
        for (int i = 0; i < inputRawData.length; i++) {
            final CodeWord reversedCodeword =
                    codeTable.get(inputRawData[i]).reverse();
            writeCodeWord(compressedOutputData, bitCursor, reversedCodeword);
            bitCursor += reversedCodeword.length();
        }
    }

    /**
     * Writes a single codeword to the compressed output data byte array.
     * Only set (1) bits are written; the output array is assumed to start
     * out zeroed.
     *
     * @param compressedOutputData the compressed output data byte array.
     * @param currentBitIndex      the bit index of the codeword's first bit.
     * @param codeword             the codeword to write.
     */
    private static void writeCodeWord(final byte[] compressedOutputData,
                                      final long currentBitIndex,
                                      final CodeWord codeword) {
        final int codewordLength = codeword.length();
        for (int bitOffset = 0; bitOffset < codewordLength; bitOffset++) {
            if (codeword.get(bitOffset)) {
                final long absoluteBitIndex = currentBitIndex + bitOffset;
                setBit(compressedOutputData,
                       (int) (absoluteBitIndex / Byte.SIZE),
                       (int) (absoluteBitIndex % Byte.SIZE));
            }
        }
    }

    /**
     * Sets the {@code bitIndex}th bit in
     * {@code compressedOutputData[byteIndex]}.
     *
     * @param compressedOutputData the compressed output data byte array.
     * @param byteIndex            the target byte index.
     * @param bitIndex             the target bit index.
     */
    private static void setBit(final byte[] compressedOutputData,
                               final int byteIndex,
                               final int bitIndex) {
        compressedOutputData[byteIndex] |= (byte) (1 << bitIndex);
    }
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_CODEWORD_MAX;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_CODE_SIZE;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_RAW_DATA_LENGTH;
import java.nio.ByteBuffer;
import java.util.Objects;
import java.nio.ByteOrder;
import java.util.Arrays;
/**
 * This class implements the reader returning the file header data such as the
 * length of the raw data being compressed and its decoding Huffman tree.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 16, 2025)
 * @since 1.0.0 (Nov 16, 2025)
 */
public final class ByteArrayHeaderReader {

    /**
     * The compressed data byte array containing the header.
     */
    private final byte[] compressedData;

    /**
     * We cache this in order users of this class can query the length of the
     * raw data that would result from decompression.
     */
    private final int rawDataLength;

    /**
     * The code table being read.
     */
    private final ByteHuffmanCodeTable codeTable;

    /**
     * Constructs this header reader and eagerly parses the entire header.
     *
     * @param compressedData the compressed data starting with the header.
     * @throws NullPointerException if {@code compressedData} is {@code null}.
     */
    public ByteArrayHeaderReader(final byte[] compressedData) {
        this.compressedData =
                Objects.requireNonNull(compressedData,
                                       "The input compressed data is null");
        final int codeTableSize = readCodeTableSize();
        this.rawDataLength = readRawDataLength();
        this.codeTable = readCodeTable(codeTableSize);
    }

    /**
     * Returns the length (in bytes) of the data that decompression produces.
     *
     * @return the raw data length.
     */
    public int getRawDataLength() {
        return rawDataLength;
    }

    /**
     * Returns the code table parsed from the header.
     *
     * @return the code table.
     */
    public ByteHuffmanCodeTable getCodeTable() {
        return codeTable;
    }

    /**
     * Reads the number of code table entries from the first header field.
     * Wrapping the array with an offset avoids copying the field bytes first.
     */
    private int readCodeTableSize() {
        return ByteBuffer.wrap(compressedData, 0, BYTES_PER_CODE_SIZE)
                         .order(ByteOrder.LITTLE_ENDIAN)
                         .getInt();
    }

    /**
     * Reads the raw data length from the second header field.
     */
    private int readRawDataLength() {
        return ByteBuffer.wrap(compressedData,
                               BYTES_PER_CODE_SIZE,
                               BYTES_PER_RAW_DATA_LENGTH)
                         .order(ByteOrder.LITTLE_ENDIAN)
                         .getInt();
    }

    /**
     * Reads all {@code codeTableSize} fixed-width code entries that follow
     * the two integer header fields.
     *
     * @param codeTableSize the number of entries to read.
     * @return the reconstructed code table.
     */
    private ByteHuffmanCodeTable readCodeTable(final int codeTableSize) {
        final ByteHuffmanCodeTable codeTable = new ByteHuffmanCodeTable();
        final int codeEntryLength = Utils.getCodeEntryLength();
        int byteCursor = BYTES_PER_CODE_SIZE + BYTES_PER_RAW_DATA_LENGTH;
        for (int codeIndex = 0; codeIndex < codeTableSize; ++codeIndex) {
            readCodeEntry(codeTable, compressedData, byteCursor);
            byteCursor += codeEntryLength;
        }
        return codeTable;
    }

    /**
     * Reads a single code entry: one byte value, one codeword length byte,
     * and {@code BYTES_PER_CODEWORD_MAX} codeword bytes.
     *
     * @param codeTable      the table receiving the parsed entry.
     * @param compressedData the compressed data byte array.
     * @param byteCursor     the index of the entry's first byte.
     */
    private static void readCodeEntry(final ByteHuffmanCodeTable codeTable,
                                      final byte[] compressedData,
                                      final int byteCursor) {
        final byte value = compressedData[byteCursor];
        final byte length = compressedData[byteCursor + 1];
        final byte[] codeEntryData =
                Arrays.copyOfRange(compressedData,
                                   byteCursor + 2,
                                   byteCursor + 2 + BYTES_PER_CODEWORD_MAX);
        final CodeWord codeword = inferCodeWord(length, codeEntryData);
        codeTable.put(value, codeword);
    }

    /**
     * Rebuilds a codeword from its little-endian bit image.
     *
     * @param length   the codeword length in bits.
     * @param codeData the {@code BYTES_PER_CODEWORD_MAX} codeword bytes.
     * @return the reconstructed codeword.
     */
    private static CodeWord inferCodeWord(final int length,
                                          final byte[] codeData) {
        final int bits = ByteBuffer.wrap(codeData)
                                   .order(ByteOrder.LITTLE_ENDIAN)
                                   .getInt();
        final CodeWord codeword = new CodeWord(length);
        int mask = 1;
        for (int i = 0; i < length; ++i) {
            if ((bits & mask) != 0) {
                codeword.set(i);
            }
            mask <<= 1;
        }
        return codeword;
    }
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_CODEWORD_MAX;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_CODE_SIZE;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_RAW_DATA_LENGTH;
import static io.github.coderodde.compressor.app.Configuration.CODE_TABLE_CAPACITY;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Objects;
/**
 * This class writes the file header to the compressed file.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 16, 2025)
 * @since 1.0.0 (Nov 16, 2025)
 */
public final class ByteArrayHeaderWriter {

    /**
     * The minimum length of the raw data byte array.
     */
    private static final int MINIMUM_RAW_DATA_LENGTH = 1;

    /**
     * The length of the raw data in bytes.
     */
    private final int rawDataLength;

    /**
     * The output data containing the compressed file.
     */
    private final byte[] outputData;

    /**
     * The index of the bit in the compressed data byte array at which writing
     * compressed data must begin. Populated by {@link #write()}.
     */
    private long dataStartBitIndex;

    /**
     * The code table to write to the compressed file header.
     */
    private final ByteHuffmanCodeTable codeTable;

    /**
     * Constructs this header writer.
     *
     * @param rawDataLength the length of the raw data in bytes.
     * @param outputData    the array receiving the entire compressed file.
     * @param codeTable     the code table to serialize into the header.
     */
    public ByteArrayHeaderWriter(final int rawDataLength,
                                 final byte[] outputData,
                                 final ByteHuffmanCodeTable codeTable) {
        checkRawDataLength(rawDataLength);
        Objects.requireNonNull(outputData, "The output data array is null");
        Objects.requireNonNull(codeTable, "The input code table is null");
        checkCodeTable(codeTable);
        this.rawDataLength = rawDataLength;
        this.outputData = outputData;
        this.codeTable = codeTable;
    }

    /**
     * Writes the complete header: code size, raw data length, code table.
     */
    public void write() {
        writeCodeSize();
        writeRawDataLength();
        writeCodeTable();
    }

    /**
     * Returns the bit index at which the compressed payload must start.
     * Valid only after {@link #write()} has been called.
     *
     * @return the payload start bit index.
     */
    public long getDataStartBitIndex() {
        return dataStartBitIndex;
    }

    /**
     * Writes the code size to the very first 32-bit integer of the
     * compressed file.
     */
    private void writeCodeSize() {
        writeIntField(codeTable.size(), 0, BYTES_PER_CODE_SIZE);
    }

    /**
     * Writes the length of the raw data file to the second 32-bit word in
     * the compressed file.
     */
    private void writeRawDataLength() {
        writeIntField(rawDataLength,
                      BYTES_PER_CODE_SIZE,
                      BYTES_PER_RAW_DATA_LENGTH);
    }

    /**
     * Serializes {@code value} as a little-endian integer field of
     * {@code fieldWidth} bytes at {@code offset} in the output array.
     *
     * @param value      the integer to write.
     * @param offset     the byte offset of the field in the output array.
     * @param fieldWidth the width of the field in bytes.
     */
    private void writeIntField(final int value,
                               final int offset,
                               final int fieldWidth) {
        final byte[] encoded = ByteBuffer.allocate(fieldWidth)
                                         .order(ByteOrder.LITTLE_ENDIAN)
                                         .putInt(value)
                                         .array();
        System.arraycopy(encoded, 0, outputData, offset, encoded.length);
    }

    /**
     * Writes the actual code table to the compressed file header starting
     * from the 8th byte, then records where the payload bits must begin.
     */
    private void writeCodeTable() {
        int cursor = BYTES_PER_CODE_SIZE + BYTES_PER_RAW_DATA_LENGTH;
        for (int unsignedValue = 0;
                unsignedValue < CODE_TABLE_CAPACITY;
                unsignedValue++) {
            // The cast keeps only the 8 least significant bits.
            final byte value = (byte) unsignedValue;
            final CodeWord codeword = codeTable.get(value);
            if (codeword == null) {
                continue;
            }
            outputData[cursor++] = value;
            outputData[cursor++] = (byte) codeword.length();
            final byte[] codewordBytes = codeword.toByteArray();
            System.arraycopy(codewordBytes,
                             0,
                             outputData,
                             cursor,
                             codewordBytes.length);
            cursor += BYTES_PER_CODEWORD_MAX;
        }
        this.dataStartBitIndex = cursor * Byte.SIZE;
    }

    /**
     * Rejects raw data lengths below {@link #MINIMUM_RAW_DATA_LENGTH}.
     */
    private static void checkRawDataLength(final int rawDataLength) {
        if (rawDataLength < MINIMUM_RAW_DATA_LENGTH) {
            throw new TooShortRawDataLengthException(
                    String.format(
                            "The length of the raw data is too small: %d. " +
                            "Must be at least %d.",
                            rawDataLength,
                            MINIMUM_RAW_DATA_LENGTH));
        }
    }

    /**
     * Rejects empty code tables.
     */
    private static void checkCodeTable(final ByteHuffmanCodeTable codeTable) {
        if (codeTable.isEmpty()) {
            throw new EmptyCodeTableException();
        }
    }
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.Configuration.CODE_TABLE_CAPACITY;
/**
 * This class models the byte frequency distributions.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 13, 2025)
 * @since 1.0.0 (Nov 13, 2025)
 */
public final class ByteFrequencyDistribution {

    /**
     * Maps each of the 256 possible byte values (used as an unsigned index)
     * to the number of its recorded occurrences.
     */
    private final long[] frequencies = new long[CODE_TABLE_CAPACITY];

    /**
     * Returns the number of distinct byte values with a non-zero frequency.
     *
     * @return the size of this distribution.
     */
    public int size() {
        int distinctValueCount = 0;
        for (int i = 0; i < frequencies.length; i++) {
            if (frequencies[i] != 0L) {
                distinctValueCount++;
            }
        }
        return distinctValueCount;
    }

    /**
     * Returns {@code true} if and only if this distribution is empty.
     *
     * @return a Boolean flag indicating whether this distribution is empty.
     */
    public boolean isEmpty() {
        return size() == 0;
    }

    /**
     * Records one more occurrence of the input byte value.
     *
     * @param value the byte value to count.
     */
    public void incrementFrequency(final byte value) {
        frequencies[Byte.toUnsignedInt(value)]++;
    }

    /**
     * Returns the number of occurrences recorded for the input byte value.
     *
     * @param value the byte value to query.
     * @return the frequency of {@code value}.
     */
    public long getFrequency(final byte value) {
        return frequencies[Byte.toUnsignedInt(value)];
    }
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.Configuration.CODE_TABLE_CAPACITY;
/**
 * This class implements a Huffman code table over byte (8-bit) alphabet.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 20, 2025)
 * @since 1.0.0 (Nov 20, 2025)
 */
public final class ByteHuffmanCodeTable {

    /**
     * The actual codeword table, indexed by the unsigned byte value.
     */
    private final CodeWord[] table = new CodeWord[CODE_TABLE_CAPACITY];

    /**
     * The number of byte/codeword mappings in this code table.
     */
    private int size;

    /**
     * Associates the input byte {@code value} with the codeword
     * {@code codeword}. The codeword is expected to be non-null.
     *
     * @param value    the byte key.
     * @param codeword the value codeword.
     */
    public void put(final byte value, final CodeWord codeword) {
        final int index = Byte.toUnsignedInt(value);
        if (table[index] == null) {
            // Count only brand new mappings: previously, overwriting an
            // existing codeword incorrectly inflated the reported size.
            ++size;
        }
        table[index] = codeword;
    }

    /**
     * Accesses the codeword associated with the byte value {@code value}.
     *
     * @param value the byte key.
     * @return the associated codeword, or {@code null} if there is none.
     */
    public CodeWord get(final byte value) {
        return table[Byte.toUnsignedInt(value)];
    }

    /**
     * Returns the number of byte/codeword mappings in this code table.
     *
     * @return the size of this code table.
     */
    public int size() {
        return size;
    }

    /**
     * Returns {@code true} if and only if this code table is empty.
     *
     * @return {@code true} if empty, {@code false} otherwise.
     */
    public boolean isEmpty() {
        return size == 0;
    }

    @Override
    public boolean equals(final Object object) {
        if (object == this) {
            return true;
        }
        if (object == null) {
            return false;
        }
        if (!getClass().equals(object.getClass())) {
            return false;
        }
        final ByteHuffmanCodeTable other = (ByteHuffmanCodeTable) object;
        if (size != other.size()) {
            return false;
        }
        for (int i = 0; i < CODE_TABLE_CAPACITY; ++i) {
            final CodeWord cw1 = get((byte) i);
            final CodeWord cw2 = other.get((byte) i);
            if (cw1 == null && cw2 == null) {
                continue;
            }
            if (cw1 == null || cw2 == null) {
                // Exactly one of them is null.
                return false;
            }
            if (!cw1.equals(cw2)) {
                return false;
            }
        }
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * Added alongside {@code equals}: the hash is computed from each
     * codeword's observable bits (via its public API) so that it stays
     * consistent with {@code equals} regardless of how {@code CodeWord}
     * itself hashes.
     */
    @Override
    public int hashCode() {
        int result = size;
        for (final CodeWord codeword : table) {
            int codewordHash = 0;
            if (codeword != null) {
                codewordHash = codeword.length();
                for (int i = 0; i < codeword.length(); ++i) {
                    codewordHash = 31 * codewordHash
                                 + (codeword.get(i) ? 1 : 0);
                }
            }
            result = 31 * result + codewordHash;
        }
        return result;
    }
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.Configuration.CODE_TABLE_CAPACITY;
import java.util.HashSet;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
/**
 * This class implements the Huffman code builder over weight distributions.
 *
 * @author Rodion "rodde" Efremov
 * @version 2.0.1 (Nov 14, 2025)
 * @since 1.0.0 (Nov 12, 2025)
 */
public final class ByteHuffmanCodeTableBuilder {

    private ByteHuffmanCodeTableBuilder() {
        // Static utility class; not instantiable.
    }

    /**
     * Builds the Huffman code table for the input byte frequency
     * distribution.
     *
     * @param byteFrequencyDistribution the distribution to build a code for.
     * @return the resulting code table.
     */
    public static ByteHuffmanCodeTable
        buildCode(final ByteFrequencyDistribution byteFrequencyDistribution) {
        Objects.requireNonNull(byteFrequencyDistribution,
                "The input byte frequency distribution is null");
        if (byteFrequencyDistribution.isEmpty()) {
            throw new IllegalArgumentException(
                    "The input byte frequency distribution is empty");
        }
        final ByteHuffmanCodeTable codeTable = new ByteHuffmanCodeTable();
        final Queue<WeightedByteSet> mergeQueue = new PriorityQueue<>();
        // Seed the queue with a singleton set for every occurring byte value,
        // each starting with an empty codeword.
        for (int unsignedValue = 0;
                unsignedValue < CODE_TABLE_CAPACITY;
                unsignedValue++) {
            final byte value = (byte) unsignedValue;
            final long weight =
                    byteFrequencyDistribution.getFrequency(value);
            if (weight > 0L) {
                final Set<Byte> singleton = new HashSet<>();
                singleton.add(value);
                codeTable.put(value, new CodeWord(0));
                mergeQueue.add(new WeightedByteSet(singleton, weight));
            }
        }
        // Repeatedly merge the two lightest sets; the first set's members
        // receive a leading 1-bit, the second set's members a leading 0-bit.
        while (mergeQueue.size() > 1) {
            final WeightedByteSet first = mergeQueue.remove();
            final WeightedByteSet second = mergeQueue.remove();
            for (final byte member : first.getSet()) {
                codeTable.get(member).prependBit(true);
            }
            for (final byte member : second.getSet()) {
                codeTable.get(member).prependBit(false);
            }
            first.getSet().addAll(second.getSet());
            mergeQueue.add(
                    new WeightedByteSet(first.getSet(),
                                        first.getTotalWeight()
                                                + second.getTotalWeight()));
        }
        return codeTable;
    }
}
package io.github.coderodde.compressor.app;
import java.util.Objects;
/**
 * This class implements the Huffman decoding tree.
 *
 * @author Rodion "rodde" Efremov
 * @param <S> the alphabet symbol type. NOTE: this parameter is never used by
 *            this byte-based implementation; it is retained only so that
 *            existing {@code ByteHuffmanDecoderTree<Byte>} usages keep
 *            compiling.
 * @version 1.0.0 (Nov 14, 2025)
 * @since 1.0.0 (Nov 14, 2025)
 */
public final class ByteHuffmanDecoderTree<S> {

    /**
     * This static inner class implements the Huffman decoding tree node.
     * A node carries a non-null {@code value} if and only if it is a leaf.
     */
    private static final class TreeNode {

        Byte value;
        TreeNode zeroChild;
        TreeNode oneChild;
    }

    /**
     * The root node of the tree.
     */
    private final TreeNode root = new TreeNode();

    /**
     * Caches the code length of the previously decoded symbol.
     */
    private long previousCodeLength = -1L;

    /**
     * Constructs this Huffman decoding tree.
     *
     * @param codeTable the code table for which to construct the decoding
     *                  tree.
     */
    public ByteHuffmanDecoderTree(final ByteHuffmanCodeTable codeTable) {
        Objects.requireNonNull(codeTable, "The input code table is null");
        for (int value = 0;
                value < Configuration.CODE_TABLE_CAPACITY;
                value++) {
            final CodeWord codeword = codeTable.get((byte) value);
            if (codeword != null) {
                insert(Byte.valueOf((byte) value), codeword);
            }
        }
    }

    /**
     * Decodes a codeword to a symbol. Also records the length of the decoded
     * codeword, retrievable via {@link #getPreviousCodeLength()}.
     *
     * @param compressedData the compressed data for decompression.
     * @param bitIndex       the index of the starting bit of a codeword to
     *                       scan.
     * @return the encoded symbol.
     */
    public byte decode(final byte[] compressedData, long bitIndex) {
        Objects.requireNonNull(compressedData, "The input raw data is null");
        previousCodeLength = 0;
        TreeNode node = root;
        // Walk from the root towards a leaf, consuming one bit per step.
        while (node.value == null) {
            final boolean bit = readBit(compressedData, bitIndex);
            node = bit ? node.oneChild : node.zeroChild;
            ++bitIndex;
            ++previousCodeLength;
        }
        return node.value;
    }

    /**
     * Returns the codeword length of the most recently decoded symbol.
     *
     * @return the previous code length in bits.
     */
    public long getPreviousCodeLength() {
        return previousCodeLength;
    }

    /**
     * Reads a single bit from the input data at the given bit index.
     *
     * @param rawData  the data to read from.
     * @param bitIndex the global bit index.
     * @return {@code true} if the bit is set.
     */
    private static boolean readBit(final byte[] rawData, final long bitIndex) {
        final int byteIndex = (int) (bitIndex / Byte.SIZE);
        final byte targetByte = rawData[byteIndex];
        final byte mask = (byte) (1 << (bitIndex % Byte.SIZE));
        return (mask & targetByte) != 0;
    }

    /**
     * Inserts the value/codeword pair into this tree.
     *
     * @param value    the byte value to insert.
     * @param codeword the codeword associated with the input value. The bits
     *                 are consumed from the highest index downwards.
     * @throws IllegalStateException if another value already occupies the
     *                               node addressed by {@code codeword}.
     */
    private void insert(final Byte value, final CodeWord codeword) {
        TreeNode node = root;
        for (int i = codeword.length() - 1; i >= 0; --i) {
            final boolean bit = codeword.get(i);
            if (bit) {
                // Bit is set to 1.
                if (node.oneChild == null) {
                    node.oneChild = new TreeNode();
                }
                node = node.oneChild;
            } else {
                // Bit is set to 0.
                if (node.zeroChild == null) {
                    node.zeroChild = new TreeNode();
                }
                node = node.zeroChild;
            }
        }
        if (node.value != null) {
            throw new IllegalStateException(
                    String.format(
                            "Duplicate codeword [%s]: " +
                            "two symbols share the same code.",
                            codeword.toString()));
        }
        node.value = value;
    }
}
package io.github.coderodde.compressor.app;
/**
 * This class provides a method for building instances of
 * {@link io.github.coderodde.compressor.app.ByteFrequencyDistribution} over
 * byte-wise data.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 16, 2025)
 * @since 1.0.0 (Nov 16, 2025)
 */
public final class ByteWeightDistributionBuilder {

    private ByteWeightDistributionBuilder() {
        // Static utility class; not instantiable.
    }

    /**
     * Builds and returns the weight distribution of the input raw data.
     *
     * @param rawData the byte array holding the data to compress.
     * @return the weight distribution.
     */
    public static ByteFrequencyDistribution
        buildByteWeightDistribution(final byte[] rawData) {
        final ByteFrequencyDistribution distribution =
                new ByteFrequencyDistribution();
        for (int i = 0; i < rawData.length; i++) {
            distribution.incrementFrequency(rawData[i]);
        }
        return distribution;
    }
}
package io.github.coderodde.compressor.app;
import java.util.BitSet;
/**
* This class implements a <b>binary</b> code word in data compression
* scenarios.
*
* @author Rodion "rodde" Efremov
* @version 1.1.0 (Nov 13, 2025)
* @since 1.0.0 (Oct 28, 2025)
*/
public class CodeWord {
private int length;
private final BitSet bits;
public CodeWord(final int length) {
checkLength(length);
this.length = length;
this.bits = new BitSet(length);
}
public CodeWord reverse() {
final CodeWord reversed = new CodeWord(length);
for (int i = length - 1, j = 0; i >= 0; --i, ++j) {
if (get(i)) {
reversed.set(j);
}
}
return reversed;
}
public byte[] toByteArray() {
return bits.toByteArray();
}
public void prependBit(final boolean bit) {
if (bit) {
bits.set(length);
}
++length;
}
public int length() {
return length;
}
public boolean get(final int index) {
checkIndex(index);
return bits.get(index);
}
public void set(final int index) {
checkIndex(index);
bits.set(index);
}
public void unset(final int index) {
checkIndex(index);
bits.set(index, false);
}
@Override
public boolean equals(final Object object) {
if (object == this) {
return true;
}
if (object == null) {
return false;
}
if (!getClass().equals(object.getClass())) {
return false;
}
final CodeWord other = (CodeWord) object;
if (length() != other.length()) {
return false;
}
for (int i = 0; i < length(); ++i) {
if (get(i) != other.get(i)) {
return false;
}
}
return true;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(length);
for (int i = length - 1; i >= 0; --i) {
sb.append(get(i) ? "1" : "0");
}
return sb.toString();
}
private void checkIndex(final int index) {
if (index < 0) {
throw new IndexOutOfBoundsException(
String.format("index(%d) < 0", index));
}
if (index >= this.length) {
throw new IndexOutOfBoundsException(
String.format("index(%d) >= length(%d)",
index,
length));
}
}
private static void checkLength(final int length) {
if (length < 0) {
throw new IllegalArgumentException(
String.format("length(%d) < 1", length));
}
}
}
package io.github.coderodde.compressor.app;
/**
 * This class contains configuration constants.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 19, 2025)
 * @since 1.0.0 (Nov 19, 2025)
 */
final class Configuration {

    private Configuration() {
        // Constants holder; not instantiable.
    }

    /**
     * This capacity stands for {@code 2^8 = 256}, the number of distinct
     * byte values.
     */
    static final int CODE_TABLE_CAPACITY = 256;

    /**
     * The number of bytes used to store the size of the Huffman code table
     * in the header.
     */
    static final int BYTES_PER_CODE_SIZE = 4;

    /**
     * The number of bytes used to store the length (in bytes) of the
     * original, uncompressed input.
     */
    static final int BYTES_PER_RAW_DATA_LENGTH = 4;

    /**
     * The number of bytes reserved for the byte value of a code table entry.
     */
    static final int BYTES_PER_BYTE_DESCRIPTOR = 1;

    /**
     * The number of bytes reserved for the codeword length of a code table
     * entry.
     */
    static final int BYTES_PER_CODEWORD_LENGTH = 1;

    /**
     * The number of bytes reserved for the codeword bits themselves.
     */
    static final int BYTES_PER_CODEWORD_MAX = 4;
}
package io.github.coderodde.compressor.app;
/**
 * The instances of this class are thrown when dealing with empty code tables.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 16, 2025)
 * @since 1.0.0 (Nov 16, 2025)
 */
public final class EmptyCodeTableException extends RuntimeException {

    /**
     * Constructs this exception with its canned diagnostic message.
     */
    public EmptyCodeTableException() {
        super("The code table is empty");
    }
}
package io.github.coderodde.compressor.app;
import java.util.Objects;
/**
 * This class implements a method for compressing byte-wise files via Huffman-
 * coding.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 14, 2025)
 * @since 1.0.0 (Nov 14, 2025)
 */
public final class HuffmanByteCompressor {

    private HuffmanByteCompressor() {
        // Static utility class; not instantiable.
    }

    /**
     * Compresses the {@code rawData} {@code byte}-array.
     *
     * @param rawData the raw data to compress.
     * @return the full binary {@code byte}-array containing all the data
     *         needed to decompress the compressed file (header + payload).
     * @throws NullPointerException     if {@code rawData} is {@code null}.
     * @throws IllegalArgumentException if {@code rawData} is empty.
     */
    public static byte[] compress(final byte[] rawData) {
        Objects.requireNonNull(rawData, "The input byte array is null");
        if (rawData.length == 0) {
            throw new IllegalArgumentException(
                    "The input byte array is empty");
        }
        final ByteFrequencyDistribution byteWeightDistribution =
                ByteWeightDistributionBuilder
                        .buildByteWeightDistribution(rawData);
        final ByteHuffmanCodeTable codeTable =
                ByteHuffmanCodeTableBuilder.buildCode(byteWeightDistribution);
        // Header size in bytes plus payload size in bytes. Note that
        // Utils.countBitsInRawData returns a byte count: it rounds the bit
        // count up to whole bytes internally.
        final int headerByteCount =
                Utils.countBytesInCodeHeader(codeTable.size());
        final long payloadByteCount =
                Utils.countBitsInRawData(codeTable, rawData);
        final byte[] outputData =
                new byte[(int) (headerByteCount + payloadByteCount)];
        final ByteArrayHeaderWriter headerWriter =
                new ByteArrayHeaderWriter(rawData.length,
                                          outputData,
                                          codeTable);
        headerWriter.write();
        final ByteArrayCompressedDataWriter dataWriter =
                new ByteArrayCompressedDataWriter(
                        outputData,
                        rawData,
                        headerWriter.getDataStartBitIndex(),
                        codeTable);
        dataWriter.write();
        return outputData;
    }
}
package io.github.coderodde.compressor.app;
/**
 * This class implements a method for <b>decompressing</b> byte-wise files via
 * Huffman-coding.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 14, 2025)
 * @since 1.0.0 (Nov 14, 2025)
 */
public final class HuffmanByteDecompressor {

    private HuffmanByteDecompressor() {
        // Static utility class; not instantiable.
    }

    /**
     * Decompresses the data previously produced by
     * {@link HuffmanByteCompressor#compress(byte[])}.
     *
     * @param compressedData the compressed data (header + payload).
     * @return the original raw data.
     * @throws NullPointerException if {@code compressedData} is {@code null}.
     */
    public static byte[] decompress(final byte[] compressedData) {
        if (compressedData == null) {
            throw new NullPointerException(
                    "The input compressed data is null");
        }
        final ByteArrayHeaderReader headerReader =
                new ByteArrayHeaderReader(compressedData);
        final ByteHuffmanCodeTable codeTable = headerReader.getCodeTable();
        final byte[] rawData = new byte[headerReader.getRawDataLength()];
        final ByteHuffmanDecoderTree<Byte> decoderTree =
                new ByteHuffmanDecoderTree<>(codeTable);
        // The payload bits begin right after the fixed-layout header.
        final long startingBitIndex =
                (long) Utils.countBytesInCodeHeader(codeTable.size())
                        * Byte.SIZE;
        final ByteArrayCompressedDataReader dataReader =
                new ByteArrayCompressedDataReader(rawData,
                                                  compressedData,
                                                  startingBitIndex,
                                                  decoderTree);
        dataReader.read();
        return rawData;
    }
}
package io.github.coderodde.compressor.app;
/**
 * This exception class implements an exception that is thrown when a
 * frequency distribution does not contain a requested symbol.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 13, 2025)
 * @since 1.0.0 (Nov 13, 2025)
 */
public class SymbolNotFoundException extends RuntimeException {

    /**
     * Constructs this exception with the given diagnostic message.
     *
     * @param exceptionMessage the message describing the missing symbol.
     */
    public SymbolNotFoundException(final String exceptionMessage) {
        super(exceptionMessage);
    }
}
package io.github.coderodde.compressor.app;
/**
 * The instances of this class denote the situations where the compressed raw
 * data is too short.
 *
 * @author Rodion "rodde" Efremov
 * @version 1.0.0 (Nov 16, 2025)
 * @since 1.0.0 (Nov 16, 2025)
 */
public final class TooShortRawDataLengthException extends RuntimeException {

    /**
     * Constructs this exception with the given diagnostic message.
     *
     * @param exceptionMessage the message describing the offending length.
     */
    public TooShortRawDataLengthException(final String exceptionMessage) {
        super(exceptionMessage);
    }
}
package io.github.coderodde.compressor.app;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_BYTE_DESCRIPTOR;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_CODEWORD_LENGTH;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_CODEWORD_MAX;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_CODE_SIZE;
import static io.github.coderodde.compressor.app.Configuration.BYTES_PER_RAW_DATA_LENGTH;
/**
* This class contains some various helper methods.
*
* @author Rodion "rodde" Efremov
* @version 1.1.0 (Nov 21, 2025)
* @since 1.0.0 (Nov 16, 2025)
*/
public final class Utils {

    private Utils() {
        // Static utility class; no instances.
    }

    /**
     * Computes the number of <b>bytes</b> (not bits, despite the method
     * name) needed to store the Huffman-encoded form of {@code rawData}:
     * the total codeword bit count rounded up to a whole byte.
     *
     * <p>NOTE(review): the name is a misnomer — it cannot be renamed here
     * without breaking callers, but {@code countBytesInEncodedData} would
     * be more accurate.
     *
     * @param code    the Huffman code table mapping bytes to codewords.
     * @param rawData the uncompressed input data.
     * @return the encoded payload size in bytes.
     */
    public static long countBitsInRawData(final ByteHuffmanCodeTable code,
                                          final byte[] rawData) {
        long bits = 0;
        for (final byte b : rawData) {
            bits += code.get(b).length();
        }
        // Ceiling division: a partial trailing byte still occupies a full
        // byte in the output.
        return (bits + Byte.SIZE - 1) / Byte.SIZE;
    }

    /**
     * Returns the number of header bytes used to describe a single
     * code-table entry (symbol byte + codeword length + codeword bits).
     *
     * @return the per-entry header length in bytes.
     */
    public static int getCodeEntryLength() {
        return BYTES_PER_BYTE_DESCRIPTOR +
               BYTES_PER_CODEWORD_LENGTH +
               BYTES_PER_CODEWORD_MAX;
    }

    /**
     * Returns the total header size in bytes for a code table with
     * {@code codeSize} entries, including the code-size and raw-data-length
     * fields.
     *
     * @param codeSize the number of entries in the code table.
     * @return the full header length in bytes.
     */
    public static int countBytesInCodeHeader(final int codeSize) {
        return codeSize * getCodeEntryLength()
                + BYTES_PER_CODE_SIZE
                + BYTES_PER_RAW_DATA_LENGTH;
    }
}
package io.github.coderodde.compressor.app;
import java.util.Set;
/**
* The instances of this class hold sets of symbols with the sum of their
* frequencies.
*
* @author Rodion "rodde" Efremov
* @version 1.0.0 (Nov 14, 2025)
* @since 1.0.0 (Nov 14, 2025)
*/
final class WeightedByteSet implements Comparable<WeightedByteSet> {

    /** The set of symbols this node covers. */
    private final Set<Byte> set;

    /** The sum of the frequencies of all symbols in {@link #set}. */
    private final long totalSetWeight;

    /**
     * Constructs a weighted symbol set.
     *
     * @param set            the symbols covered by this set.
     * @param totalSetWeight the sum of the symbols' frequencies.
     */
    WeightedByteSet(final Set<Byte> set,
                    final long totalSetWeight) {
        this.set = set;
        this.totalSetWeight = totalSetWeight;
    }

    Set<Byte> getSet() {
        return set;
    }

    long getTotalWeight() {
        return totalSetWeight;
    }

    /**
     * Orders weighted sets by ascending total weight.
     *
     * <p>Uses {@link Long#compare}: the previous {@code Double.compare} on
     * two {@code long}s silently lost precision for weights above 2^53 and
     * could report unequal weights as equal.
     *
     * <p>NOTE(review): this ordering is inconsistent with {@code equals()}
     * (which is identity-based); fine for priority queues, but do not use
     * instances in sorted sets/maps.
     */
    @Override
    public int compareTo(final WeightedByteSet o) {
        return Long.compare(totalSetWeight, o.totalSetWeight);
    }
}
package io.github.coderodde.compressor.app;
import java.util.Arrays;
import org.junit.Test;
import static org.junit.Assert.*;
public class ByteArrayCompressedDataReaderTest {

    private static final int STRESS_TEST_ITERATIONS = 50;

    // Shared output buffer; cleared before each stress iteration.
    private static final byte[] COMPRESSED_DATA = new byte[10_000];

    /**
     * Round-trips a tiny fixed input through the writer and reader.
     */
    @Test
    public void readSmallTest() {
        final byte[] rawData = { 45, 46, 47, 47, 46, 47 };
        final ByteFrequencyDistribution wd =
                ByteWeightDistributionBuilder
                        .buildByteWeightDistribution(rawData);
        final ByteHuffmanCodeTable codeTable =
                ByteHuffmanCodeTableBuilder.buildCode(wd);
        final ByteArrayCompressedDataWriter writer =
                new ByteArrayCompressedDataWriter(COMPRESSED_DATA,
                                                  rawData,
                                                  0,
                                                  codeTable);
        writer.write();
        final ByteHuffmanDecoderTree<Byte> decoderTree =
                new ByteHuffmanDecoderTree<>(codeTable);
        final byte[] resultRawData = new byte[rawData.length];
        final ByteArrayCompressedDataReader reader =
                new ByteArrayCompressedDataReader(resultRawData,
                                                  COMPRESSED_DATA,
                                                  0,
                                                  decoderTree);
        reader.read();
        // assertArrayEquals reports the first differing index on failure,
        // unlike assertTrue(Arrays.equals(...)).
        assertArrayEquals(rawData, resultRawData);
    }

    /**
     * Round-trips random data repeatedly through the writer and reader.
     */
    // NOTE(review): this test had its @Test annotation commented out with no
    // explanation; re-enabled. If it must be skipped, use @Ignore with a
    // reason instead of commenting out the annotation.
    @Test
    public void readStressTest() {
        for (int i = 0; i < STRESS_TEST_ITERATIONS; ++i) {
            // Clear the shared buffer so no bits leak between iterations.
            Arrays.fill(COMPRESSED_DATA, (byte) 0);
            final byte[] rawData = TestUtils.getRawData();

            // Write the compressed data:
            final ByteFrequencyDistribution wd =
                    ByteWeightDistributionBuilder
                            .buildByteWeightDistribution(rawData);
            final ByteHuffmanCodeTable codeTable =
                    ByteHuffmanCodeTableBuilder.buildCode(wd);
            final ByteArrayCompressedDataWriter writer =
                    new ByteArrayCompressedDataWriter(COMPRESSED_DATA,
                                                      rawData,
                                                      0,
                                                      codeTable);
            writer.write();

            // Read the source data:
            final ByteHuffmanDecoderTree<Byte> decoderTree =
                    new ByteHuffmanDecoderTree<>(codeTable);
            final byte[] resultRawData = new byte[rawData.length];
            final ByteArrayCompressedDataReader reader =
                    new ByteArrayCompressedDataReader(resultRawData,
                                                      COMPRESSED_DATA,
                                                      0,
                                                      decoderTree);
            reader.read();
            assertArrayEquals(rawData, resultRawData);
        }
    }
}
package io.github.coderodde.compressor.app;
import java.util.Arrays;
import org.junit.Test;
import static org.junit.Assert.*;
import org.junit.Before;
public final class ByteArrayHeaderReaderTest {

    private static final int STRESS_TEST_ITERATIONS = 50;

    // Shared buffer written by the header writer and read back by the reader.
    private static final byte[] COMPRESSED_DATA = new byte[10_000];

    @Before
    public void clearCompressedData() {
        Arrays.fill(COMPRESSED_DATA, (byte) 0);
    }

    /**
     * Writes a header for a tiny fixed input and checks the reader
     * recovers both the raw-data length and the code table.
     */
    @Test
    public void smallTest() {
        final byte[] rawData = { 88, 40, 48 };
        final ByteFrequencyDistribution distribution =
                ByteWeightDistributionBuilder
                        .buildByteWeightDistribution(rawData);
        final ByteHuffmanCodeTable expectedCodeTable =
                ByteHuffmanCodeTableBuilder.buildCode(distribution);

        final ByteArrayHeaderWriter headerWriter =
                new ByteArrayHeaderWriter(rawData.length,
                                          COMPRESSED_DATA,
                                          expectedCodeTable);
        headerWriter.write();

        final ByteArrayHeaderReader headerReader =
                new ByteArrayHeaderReader(COMPRESSED_DATA);

        assertEquals(rawData.length, headerReader.getRawDataLength());
        assertEquals(expectedCodeTable, headerReader.getCodeTable());
    }

    /**
     * Repeats the write/read header round trip on random data.
     */
    @Test
    public void stressTest() {
        for (int iteration = 0;
                 iteration < STRESS_TEST_ITERATIONS;
                 ++iteration) {
            // @Before runs once per test method, so the buffer must be
            // cleared manually on every loop iteration.
            Arrays.fill(COMPRESSED_DATA, (byte) 0);

            final byte[] rawData = TestUtils.getRawData();
            final ByteFrequencyDistribution distribution =
                    ByteWeightDistributionBuilder
                            .buildByteWeightDistribution(rawData);
            final ByteHuffmanCodeTable expectedCodeTable =
                    ByteHuffmanCodeTableBuilder.buildCode(distribution);

            final ByteArrayHeaderWriter headerWriter =
                    new ByteArrayHeaderWriter(rawData.length,
                                              COMPRESSED_DATA,
                                              expectedCodeTable);
            headerWriter.write();

            final ByteArrayHeaderReader headerReader =
                    new ByteArrayHeaderReader(COMPRESSED_DATA);

            assertEquals(rawData.length, headerReader.getRawDataLength());
            assertEquals(expectedCodeTable, headerReader.getCodeTable());
        }
    }
}
package io.github.coderodde.compressor.app;
import java.util.Arrays;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
public class HuffmanByteDecompressorTest {

    private static final int STRESS_TEST_ITERATIONS = 50;

    /**
     * Compresses and decompresses a tiny fixed input and checks the
     * round trip is lossless.
     */
    @Test
    public void smallTest() {
        final byte[] rawData = { 45, 46, 47, 47, 46, 47 };
        final byte[] compressed = HuffmanByteCompressor.compress(rawData);
        final byte[] restored = HuffmanByteDecompressor.decompress(compressed);
        assertTrue(Arrays.equals(rawData, restored));
    }

    /**
     * Repeats the compress/decompress round trip on random data.
     */
    @Test
    public void decompressStressTest() {
        for (int iteration = 0;
                 iteration < STRESS_TEST_ITERATIONS;
                 ++iteration) {
            final byte[] original = TestUtils.getRawData();
            final byte[] compressed =
                    HuffmanByteCompressor.compress(original);
            final byte[] restored =
                    HuffmanByteDecompressor.decompress(compressed);
            assertTrue(Arrays.equals(original, restored));
        }
    }
}
package io.github.coderodde.compressor.app;
import java.util.Arrays;
import org.junit.Test;
import static org.junit.Assert.*;
// NOTE(review): the class name says "HuffmanDecodingTree" but the class
// under test is ByteHuffmanDecoderTree; consider renaming the test class
// (and its file) to ByteHuffmanDecoderTreeTest for discoverability.
public class HuffmanDecodingTreeTest {

    /**
     * Encodes a tiny fixed input with the writer and verifies the decoder
     * tree reconstructs it via the reader.
     */
    @Test
    public void constructAndDecode() {
        final byte[] sourceData = { 45, 46, 47, 47, 46, 47 };
        final byte[] targetData = new byte[10];
        final ByteFrequencyDistribution wd =
                ByteWeightDistributionBuilder
                        .buildByteWeightDistribution(sourceData);
        final ByteHuffmanCodeTable codeTable =
                ByteHuffmanCodeTableBuilder.buildCode(wd);
        final ByteArrayCompressedDataWriter writer =
                new ByteArrayCompressedDataWriter(targetData,
                                                  sourceData,
                                                  0,
                                                  codeTable);
        writer.write();
        final ByteHuffmanDecoderTree<Byte> decoderTree =
                new ByteHuffmanDecoderTree<>(codeTable);
        final byte[] decompressedData = new byte[sourceData.length];
        final ByteArrayCompressedDataReader reader =
                new ByteArrayCompressedDataReader(decompressedData,
                                                  targetData,
                                                  0,
                                                  decoderTree);
        reader.read();
        // assertArrayEquals reports the first differing index on failure,
        // unlike assertTrue(Arrays.equals(...)).
        assertArrayEquals(sourceData, decompressedData);
    }
}
package io.github.coderodde.compressor.app;
import java.util.Random;
public class TestUtils {
private static final int SEED = 13;
private static final int MAXIMUM_BYTE_ARRAY_LENGTH = 2_000;
private static final Random RANDOM = new Random(SEED);
private TestUtils() {
}
public static final byte[] getRawData() {
final int length = 1 + RANDOM.nextInt(MAXIMUM_BYTE_ARRAY_LENGTH);
final byte[] rawData = new byte[length];
RANDOM.nextBytes(rawData);
return rawData;
}
}
Typical output
C:\Users\rodio\Documents>java -jar HuffmanCompressorApp.java-1.1.0.jar WarAndPeace.html
[INFO] Read all file bytes in 15 milliseconds.
[INFO] Compressed the data in 412 milliseconds.
[INFO] Written the compressed data in 19 milliseconds.
C:\Users\rodio\Documents>java -jar HuffmanCompressorApp.java-1.1.0.jar WarAndPeace.html.huf uncompressed.WarAndPeace.html
[INFO] Read the compressed data in 9 milliseconds.
[INFO] Decompressed the data in 137 milliseconds.
[INFO] Written the decompressed data in 24 milliseconds.
C:\Users\rodio\Documents>fc /b WarAndPeace.html uncompressed.WarAndPeace.html
Comparing files WarAndPeace.html and UNCOMPRESSED.WARANDPEACE.HTML
FC: no differences encountered
Critique request
Please tell me anything that comes to mind.