0x00ff

Coding Challenges - Build Your Own Compression Tool

2024-11-17

I have attempted another coding challenge where your task is to build compression tool using Huffman coding.

The challenge links to the following website which explains how the Huffman coding works. According to the website I did these following steps

Get character frequencies from the text
Create Huffman tree from the character frequencies
Create Huffman codes from the tree
Encode the text using Huffman codes
Serialize the tree and write it together with the encoded text to a file
Read the file, deserialize the tree and build back the text by decoding the encoded characters

The first step is to get the frequencies of the letters in the text. I am using dictionary to store the frequencies. The function uses StreamReader to go through the text so I don't have to read the whole text into memory.

public static Dictionary<char, int> GetFrequency(StreamReader stream)
{
    var frequency = new Dictionary<char, int>();

    stream.BaseStream.Position = 0;
    var c = stream.Read();

    while (c != -1)
    {
        var ch = (char)c;

        if (frequency.ContainsKey(ch))
            frequency[ch]++;
        else
            frequency[ch] = 1;

        c = stream.Read();
    }

    return frequency;
}

Next step is creating the Huffman tree from the letter frequencies. For the tree I have created a Node class which holds the weight, character, left and right subtree.

public class Node
{
    public Node(int weight = 0, char? @char = null, Node? left = null, Node? right = null)
    {
        Weight = weight;
        Char = @char;
        Left = left;
        Right = right;
    }

    public int Weight { get; set; }
    public char? Char { get; set; }
    public Node? Left { get; set; }
    public Node? Right { get; set; }
}

Tree is created using priority queue. C# provides PriorityQueue class in the System.Collections.Generic namespace.

public static Node CreateTree(Dictionary<char, int> frequency)
{
    var queue = new PriorityQueue<Node, int>();

    foreach (var kv in frequency)
        queue.Enqueue(new Node(weight: kv.Value, @char: kv.Key), kv.Value);

    while (queue.Count > 1)
    {
        var leftTree = queue.Dequeue();
        var rightTree = queue.Dequeue();

        var root = new Node(leftTree.Weight + rightTree.Weight, left: leftTree, right: rightTree);

        queue.Enqueue(root, root.Weight);
    }

    return queue.Dequeue();
}

From the tree we are able to generate code for each letter used in the text. I wrote a function using depth first search - Dfs.

public static Dictionary<char, LutElement> CreateCodes(Node root)
{
    var res = new Dictionary<char, LutElement>();

    if (root.Left == null && root.Right == null)
    {
        res.Add((char)root.Char!, new LutElement { Weight = root.Weight, Code = "0" });
        return res;
    }

    void Dfs(Node node, string code = "")
    {
        if (node == null)
            return;

        if (node.Char != null)
            res.Add((char)node.Char, new LutElement { Weight = node.Weight, Code = code });

        if (node.Left != null)
            Dfs(node.Left, code + '0');

        if (node.Right != null)
            Dfs(node.Right, code + '1');
    }

    Dfs(root);

    return res;
}

LutElement class is used to hold the character weight and its code. I choose the word Lut as abbrevation of look up table.

public class LutElement
{
    public int Weight { get; set; }
    public string Code { get; set; } = default!;
}

Then I created these following functions to serialize and deserialize the Huffman tree. They are both using depth first search. Serialize function traverses the tree and appends bytes into List. I use bytes 0x00 to mark null node, 0x01 to mark weight and 0x02 to mark a character. Integers are stored in four bytes and characters in two bytes. To get the individual bytes I have used bit shifting.

public static byte[] SerializeTree(Node root)
{
    var res = new List();

    void Dfs(Node? node)
    {
        if (node == null)
        {
            res.Add(0x00);
            return;
        }
        else
        {
            res.Add(0x01);

            var weight = node.Weight;

            var b1 = (byte)(weight >> 24);
            var b2 = (byte)(weight >> 16);
            var b3 = (byte)(weight >> 8);
            var b4 = (byte)(weight >> 0);

            res.Add(b1);
            res.Add(b2);
            res.Add(b3);
            res.Add(b4);

            if (node.Char != null)
            {
                res.Add(0x02);

                var bc1 = (byte)(node.Char! >> 8);
                var bc2 = (byte)(node.Char! >> 0);

                res.Add(bc1);
                res.Add(bc2);
            }
            else
            {
                res.Add(0x01);
            }

            Dfs(node.Left);
            Dfs(node.Right);
        }
    }

    Dfs(root);

    return res.ToArray();
}

The Deserialize function takes the byte array and builds back the tree using depth first search. Integers and characters are built from individual bytes using bit shifting.

public static Node? DeserializeTree(byte[] bytes)
{
    var i = 0;

    Node? Dfs()
    {
        if (bytes[i] == 0x00)
        {
            i++;
            return null;
        }

        i++;

        var weight = 0;
        weight |= bytes[i + 0] << 24;
        weight |= bytes[i + 1] << 16;
        weight |= bytes[i + 2] << 8;
        weight |= bytes[i + 3] << 0;

        i += 4;

        char? @char = null;

        if (bytes[i] == 0x02)
        {
            i++;

            var ch = 0;
            ch |= bytes[i + 0] << 8;
            ch |= bytes[i + 1] << 0;

            @char = (char)ch;

            i += 2;
        }
        else if (bytes[i] == 0x01)
        {
            i++;
        }

        var node = new Node(weight, @char)
        {
            Left = Dfs(),
            Right = Dfs()
        };

        return node;
    }

    var root = Dfs();

    return root;
}

Everything is combined in the Encode and Decode functions. The check for 0xfeff bytes at the beginning of the loop in the Encode function is to skip the byte order mark of the file. I have chose the bufferSize to be 4096. First I write the length of the serialized tree into the file and the tree bytes. Then I simply loop through the encoded text building the BitArray. When the BitArray reaches the buffer size length it is written into the file.

public static void Encode(StreamReader streamReader, string destinationPath)
{
    var frequency = GetFrequency(streamReader);
    var tree = CreateTree(frequency);
    var codes = CreateCodes(tree);
    var treeBytes = SerializeTree(tree);

    using var destinationFileStream = new FileStream(destinationPath, FileMode.Create, FileAccess.Write, FileShare.Write, bufferSize, FileOptions.SequentialScan);
    using var binaryWriter = new BinaryWriter(destinationFileStream);

    var treeLength = treeBytes.Length;
    binaryWriter.Write(treeLength);
    binaryWriter.Write(treeBytes);

    var positionToWriteLength = binaryWriter.BaseStream.Position;

    binaryWriter.BaseStream.Position += sizeof(int);
    streamReader.BaseStream.Position = 0;

    var bitArray = new BitArray(bufferSize);
    int bitsLength = 0;
    var lengthToTake = 0;
    var first = true;
    var currLength = 0;

    var c = streamReader.Read();

    while (c != -1)
    {
        if (first && c == 0xfeff)
        {
            first = false;
        }
        else
        {
            var ch = (char)c;

            var currCode = codes[ch].Code;

            if (currLength + currCode.Length > bufferSize)
            {
                lengthToTake = bufferSize - currLength;

                for (var j = 0; j < lengthToTake; j++)
                {
                    bitArray[currLength] = currCode[j] == '1';
                    currLength++;
                }
            }
            else
            {
                foreach (var bit in currCode)
                {
                    bitArray[currLength] = bit == '1';
                    currLength++;
                }
            }

            if (currLength % bufferSize == 0)
            {
                bitsLength += currLength;

                var res = new byte[(bitArray.Length - 1) / 8 + 1];
                bitArray.CopyTo(res, 0);
                binaryWriter.Write(res);

                bitArray = new BitArray(bufferSize);
                currLength = 0;
            }

            if (lengthToTake > 0)
            {
                for (var j = lengthToTake; j < currCode.Length; j++)
                {
                    bitArray[currLength] = currCode[j] == '1';
                    currLength++;
                }

                lengthToTake = 0;
            }
        }

        c = streamReader.Read();
    }

    if (currLength > 0)
    {
        bitsLength += currLength;
        var res = new byte[(bitArray.Length - 1) / 8 + 1];
        bitArray.CopyTo(res, 0);
        binaryWriter.Write(res);
    }

    binaryWriter.BaseStream.Position = positionToWriteLength;
    binaryWriter.Write(bitsLength);
}

public static void Decode(string sourcePath, string destinationPath)
{
    using var sourceFileStream = new FileStream(sourcePath, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize, FileOptions.SequentialScan);
    using var sourceBinaryReader = new BinaryReader(sourceFileStream);

    using var destinationFileStream = new FileStream(destinationPath, FileMode.Create, FileAccess.Write, FileShare.Write, bufferSize, FileOptions.SequentialScan);
    using var destinationStreamWriter = new StreamWriter(destinationFileStream);

    var treeLength = sourceBinaryReader.ReadInt32();
    var treeBytes = sourceBinaryReader.ReadBytes(treeLength);
    var root = DeserializeTree(treeBytes);

    var encodedTextLength = sourceBinaryReader.ReadInt32();
    var processedBitsCount = 0;

    var currentNode = root;

    while (processedBitsCount < encodedTextLength)
    {
        var buffer = sourceBinaryReader.ReadBytes(bufferSize);

        if (buffer.Length == 0)
            break;

        var textBits = new BitArray(buffer);

        for (var i = 0; i < textBits.Length; i++)
        {
            var bit = textBits[i];

            if (currentNode!.Left == null && currentNode!.Right == null)
            {
                destinationStreamWriter.Write(currentNode.Char);
            }
            else
            {
                currentNode = bit ? currentNode!.Right : currentNode!.Left;

                if (currentNode!.Left == null && currentNode!.Right == null)
                {
                    destinationStreamWriter.Write(currentNode.Char);
                    currentNode = root;
                }
            }

            processedBitsCount++;

            if (processedBitsCount >= encodedTextLength)
                break;
        }
    }
}