;;;; a simple huffman encoder/decoder, used to compress unicode ;;;; character names. ;;;; This software is part of the SBCL system. See the README file for ;;;; more information. ;;;; ;;;; This software is derived from the CMU CL system, which was ;;;; written at Carnegie Mellon University and released into the ;;;; public domain. The software is in the public domain and is ;;;; provided with absolutely no warranty. See the COPYING and CREDITS ;;;; files for more information. (in-package "SB!IMPL") (defstruct (huffman-node (:constructor make-huffman-node (key weight))) key weight) (defstruct (huffman-pair (:include huffman-node) (:constructor make-huffman-pair (left right &aux (key (concatenate 'string (huffman-node-key left) (huffman-node-key right))) (weight (+ (huffman-node-weight left) (huffman-node-weight right)))))) left right) (defun huffman-weights (corpus) (let ((weight-table (make-hash-table :test #'equal))) (loop for string in corpus do (loop for char across string do (incf (gethash char weight-table 0)))) (let (alist) (maphash (lambda (char weight) (push (make-huffman-node (string char) weight) alist)) weight-table) alist))) (defun make-huffman-tree (corpus) (labels ((merge-table (table) (setf table (sort table #'< :key #'huffman-node-weight)) (push (make-huffman-pair (pop table) (pop table)) table) (if (second table) (merge-table table) (car table))) (finish-tree (tree) (if (huffman-pair-p tree) (list (huffman-node-key tree) (finish-tree (huffman-pair-left tree)) (finish-tree (huffman-pair-right tree))) (huffman-node-key tree)))) (finish-tree (merge-table (huffman-weights corpus))))) (defun huffman-decode (code tree) (let ((original code)) (labels ((pop-bit () (let* ((bits (integer-length code)) (bit (ldb (byte 1 (- bits 2)) code))) (setf code (dpb 1 (byte 1 (- bits 2)) (ldb (byte (- bits 1) 0) code))) bit)) (choose (branch) (destructuring-bind (key left right) branch (declare (ignore key)) (if (zerop (pop-bit)) left right))) (decode (branch) (when (zerop code) (error "Invalid Huffman-code: ~S" original)) (let ((next (choose branch))) (cond ((consp next) (decode next)) ((< 1 code) (concatenate 'string next (decode tree))) (t next))))) (decode tree)))) (defun huffman-match (char node) (if (consp node) (find char (the string (car node)) :test #'equal) (eql char (character node)))) (defun huffman-encode (string tree) (let ((code 1)) (labels ((encode (bit char tree) (when bit (setf code (+ (ash code 1) bit))) (if (consp tree) (destructuring-bind (key left right) tree (declare (ignore key)) (cond ((huffman-match char left) (encode 0 char left)) ((huffman-match char right) (encode 1 char right)) (t ;; unknown (return-from huffman-encode nil)))) (unless (huffman-match char tree) (error "Error encoding ~S (bad tree)." char))))) (loop for char across string do (encode nil char tree)) code)))