;; -*- mode: common-lisp; package: util.zip -*-
;;
;; inflate.cl
;;
;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA 
;;
;; This code is free software; you can redistribute it and/or
;; modify it under the terms of the version 2.1 of
;; the GNU Lesser General Public License as published by 
;; the Free Software Foundation, as clarified by the AllegroServe
;; prequel found in license-allegroserve.txt.
;;
;; This code is distributed in the hope that it will be useful,
;; but without any warranty; without even the implied warranty of
;; merchantability or fitness for a particular purpose.  See the GNU
;; Lesser General Public License for more details.
;;
;; Version 2.1 of the GNU Lesser General Public License is in the file 
;; license-lgpl.txt that was distributed with this file.
;; If it is not present, you can access it from
;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer
;; version) or write to the Free Software Foundation, Inc., 59 Temple Place, 
;; Suite 330, Boston, MA  02111-1307  USA
;;
;;
;; $Id: inflate.cl,v 1.1.4.2 2002/06/19 02:50:55 layer Exp $

;; Description:
;;   inflate a stream of bytes which was compressed with the Deflate
;;   algorithm
;;
;;   john foderaro, August 2001
;;
;;- This code in this file obeys the Lisp Coding Standard found in
;;- http://www.franz.com/~jkf/coding_standards.html
;;-


#|
Programming interface:

(inflate input-stream output-stream)
- the compressed information from the input-stream is read and 
  the uncompressed information is written to the output-stream
- both streams must support (unsigned-byte 8) element reading and writing


(skip-gzip-header input-stream)
- if the input stream is positioned on the header of a gzip'ed file
   then skip that header.
- if the input stream is not positioned on a gzip header then nothing 
  is done.

|#


#|
		The Deflate Compression Algorithm

reference: http://www.gzip.org/zlib/rfc-deflate.html

Basic idea:
Deflation is a means of compressing an octet sequence that
combines the LZ77 algorithm for marking common substrings and
Huffman coding to take advantage of the different frequency of occurrence
of each possible value in the file.
This algorithm may not be as easy to understand or as efficient
as the LZW compression algorithm but Deflate does have the big
advantage that it is not patented.  Thus Deflate is very
widely used.  Nowadays it's the most common compression method
used in Windows Zip programs (e.g. Winzip) and in the Unix gzip program.
Java jar files, being just zip files, also use this compression method.


Lempel-Ziv 1977 (LZ77):
An octet sequence often contains repeated subsequences.  The LZ algorithm
compresses a file by replacing repeated substrings with (Length,Distance)
markers which mean during decompression: Go back Distance octets 
in output stream and copy Length bytes to the output stream.  

Huffman Coding:
A Huffman code for a set of values V assigns a unique bitsequence
to each value in V.   A bitsequence is a sequence of 0's and 1's.
An important property of Huffman codes is that if X is the bitsequence
for a value v in V then no other value in V has a bitsequence 
with X as a prefix of that sequence.  This means that if you see
the bitsequence X in the stream you know that this denotes the value
v and you don't have to read any more bits.


Blocks:
A deflated file is a sequence of blocks.  There are three types of
blocks:
1. uncompressed - The block simply contains the same sequence of 
octets as were found in the input stream.  This type of block
is useful when the input stream has already been compressed (e.g.
it's a jpg or gif file) as compressing a compressed file often
results in the file getting larger.

2. compressed with fixed Huffman code - The block contains a 
huffman-coded LZ77 compressed bitsequence.  The huffman code
used is specified by the deflate algorithm.   This type of block
is useful when the octet sequence is short since in that case
the overhead of creating a custom huffman code is more than is gained
by that custom code.

3. compressed with a custom Huffman code - The block contains
a description of a Huffman code to be used in this block only
and then a Huffman-coded LZ77 compressed bitsequence.  The values
that describe the custom huffman tree are themselves huffman coded.
  

|#

;; Package for the inflate code.  Only the two user callable entry
;; points are exported.  The if* macro used throughout this file is not
;; standard Common Lisp; presumably it is supplied by the :excl package
;; used here.
(defpackage :util.zip (:use :common-lisp :excl)
	    (:export #:inflate
		     #:skip-gzip-header))


(in-package :util.zip)

;; announce the module under the name :inflate
(provide :inflate)

(defun inflate (p op)
  ;; user callable
  ;; Decompress the deflated data read from the stream p and send the
  ;; result to op.  p must deliver (unsigned-byte 8) elements.
  ;; op is normally an (unsigned-byte 8) output stream; flush-buffer
  ;; will also accept a function of two arguments (buffer end) here.
  ;;
  ;; The 32k octet vector holds the most recent output so that
  ;; (length,distance) references can be resolved.  Blocks are processed
  ;; until process-deflate-block reports the final one by returning nil.
  ;; Always returns nil.
  ;;
  (let ((reader (new-bit-reader p))
	(window (make-array (* 32 1024) :element-type '(unsigned-byte 8))))
    (do ((fill 0 (process-deflate-block reader op window fill)))
	((null fill)))))


;;; ------------ gzip support
;
; gzip precedes files with a header and the only support we need
; give to handle gzip files is the ability to skip the header
; and get to the meat of the file


; gzip constants
; These describe the method and flags bytes that skip-gzip-header
; reads right after the two magic number bytes.

; compression strategies (only one supported)
; the method byte must be equal to this or the header is rejected
(defconstant z_deflated 8)

; flag bits
; Each of the extra-field, orig-name, comment and head-crc bits says
; that an optional part of the header is present and has to be skipped.
; gz_ascii_flags is not tested anywhere in this file.
(defconstant gz_ascii_flags #x01)   ; file probably ascii
(defconstant gz_head_crc    #x02)   ; header crc present
(defconstant gz_extra_field #x04)   ; extra field present
(defconstant gz_orig_name   #x08)   ; original file name present
(defconstant gz_comment     #x10)   ; file comment present
(defconstant gz_reserved    #xe0)   ; no bits allowed on here

(defun skip-gzip-header (p)
  ;; user callable
  ;; If the next thing in the stream p is a gzip header then skip
  ;; past it and return t.
  ;; If it's not a gzip header then push the byte that was examined
  ;; back onto the stream and return nil.
  ;; If it starts to look like a gzip header but turns out to 
  ;; not be valid, signal an error.  Note that the first byte of
  ;; a gzip header is an illegal byte to begin a deflated stream so
  ;; that if the first byte matches a gzip header but the rest do not
  ;; then the stream was positioned at neither a gzip header nor a
  ;; deflated stream.
  ;;
  ;; The push back is done with unread-char following a read-byte, so
  ;; p has to be a stream that permits mixing byte and character
  ;; operations (e.g. an ACL bivalent simple-stream).
  ;;
  ;; An end of file inside the header is reported by read-byte.
  ;
  ;; see check_header in gzio.c in rpm zlib-1.1.3 (or variant)
  ;; for details on what's in the header.
  
  (let ((first-byte (read-byte p))
	method 
	flags)
    
    ; look for magic number
    (if* (not (eql #x1f first-byte))
       then ; not a gzip header, may be a deflate block.
	    ; unread-char has to be given the character that was
	    ; actually read, which here is by definition not #x1f.
	    (unread-char (code-char first-byte) p)
	    (return-from skip-gzip-header nil))
    

    ; now check the second magic number
    (if* (not (eql #x8b (read-byte p)))
       then (error "non gzip magic number"))
  
    (setq method (read-byte p)
	  flags  (read-byte p))

    (if* (or (not (eql method z_deflated))
	     (not (zerop (logand flags gz_reserved))))
       then (error "bad method/flags in header"))
  
    ; discard time (4 bytes), xflags and os code (1 byte each)
    (dotimes (i 6) (read-byte p))
  
    ; discard extra field if present.  It is preceded by its
    ; length as a two byte little endian value.
    (if* (logtest flags gz_extra_field)
       then (let ((length (+ (read-byte p)
			     (ash (read-byte p) 8))))
	      (dotimes (i length) (read-byte p))))
  
    (if* (logtest flags gz_orig_name)
       then ; discard name of file, null terminated
	    (do ((val (read-byte p) (read-byte p)))
		((zerop val))))
  
    (if* (logtest flags gz_comment)
       then ; discard comment, null terminated
	    (do ((val (read-byte p) (read-byte p)))
		((zerop val))))
  
    (if* (logtest flags gz_head_crc)
       then ; discard header crc
	    (dotimes (i 2) (read-byte p)))

    ; success!
    t	
    ))
		
;;;----------- end gzip support


;;;----------- support for reading bitfields from a stream
  
  
;; State needed to pull bit fields of arbitrary width out of a stream
;; of octets.  read-bits consumes bits starting with the low order bit
;; of last-byte and shifts the consumed bits out.
(defstruct bit-reader 
  stream	; the stream octets are read from with read-byte
  last-byte	; last byte read, possibly two combined bytes too
		; (holds the bits read from the stream but not yet used)
  bits		; bits left of last byte to use
  )

(defparameter *maskarray*
    ;; for a bit length, mask off junk bits:
    ;; entry n has the n low order bits set, for n from 0 through 16
    (let ((masks (make-array 17)))
      (dotimes (n 17 masks)
	(setf (svref masks n) (1- (ash 1 n))))))

;; bit reader
(defun new-bit-reader (stream)
  ; make a bit reader for stream that has no buffered bits yet
  (make-bit-reader :bits 0
		   :last-byte 0
		   :stream stream))

(defun reset-bit-reader (br)
  ; Throw away the unused bits of the current byte so that the next
  ; read starts on a byte boundary.
  ; The leftover bits have to be cleared out of last-byte as well as
  ; being uncounted: read-bits adds each new byte on top of
  ; whatever last-byte holds, so stale bits would corrupt the next
  ; value read whenever the discarded bits were not all zero.
  ; Returns 0.
  (setf (bit-reader-last-byte br) 0)
  (setf (bit-reader-bits br) 0))

(defun read-bits (br count)
  ;; Return the next count bits from the bit reader br as an integer.
  ;; The first bit taken from the stream ends up as the low order
  ;; bit of the result.
  ;; count can be from 0 to 16; a count of 0 returns 0 without
  ;; touching the reader.
  ;;
  
  (if* (eql count 0)
     then (return-from read-bits 0))
  
  (let ((acc    (bit-reader-last-byte br))	; unused bits so far
	(avail  (bit-reader-bits br))		; how many of them
	(source (bit-reader-stream br)))
    
    ; pull in whole bytes, each one above the bits we already have,
    ; until there are enough to satisfy the request
    (do ()
	((>= avail count))
      (setq acc (+ acc (ash (read-byte source) avail)))
      (incf avail 8))
    
    (if* (eql avail count)
       then ; everything is used up
	    (setf (bit-reader-bits br) 0
		  (bit-reader-last-byte br) 0)
	    acc
       else ; keep the bits that are left over for next time
	    (setf (bit-reader-last-byte br) (ash acc (- count))
		  (bit-reader-bits br) (- avail count))
	    (logand acc (svref *maskarray* count)))))


;;;----------- end bitfield reading


;;;----------- build constant tables needed by the algorithm

;; The tables needed to decode length and distance values.
;; A compressed file contains a sequence of literal character values
;; or (length,distance) pairs.  The length is computed by taking
;; the length code in the file and using these tables to find
;; a base length value and the number of extra bits to read from the file;
;; the value of those extra bits is then added to the base length.
;; The same is done for distance.

(defvar *base-length*)        ; length code - 257  ->  base length value
(defvar *length-extra-bits*)  ; length code - 257  ->  extra bits to read

(defvar *base-distance*)        ; distance code  ->  base distance value
(defvar *distance-extra-bits*)  ; distance code  ->  extra bits to read


; build those arrays at load time.
;
; Each table is described by a list of (count extra-bits) runs: 
; count consecutive codes all take extra-bits extra bits, and each
; code's base value is the previous base plus 2^extra-bits.

(flet ((expand-runs (size first-base runs)
	 ;; return two fresh vectors of the given size: the base
	 ;; values and the extra bit counts
	 (let ((bases  (make-array size))
	       (extras (make-array size))
	       (base   first-base)
	       (index  0))
	   (dolist (run runs)
	     (let ((count (car run))
		   (extra (cadr run)))
	       (dotimes (i count)
		 (setf (svref bases index) base
		       (svref extras index) extra)
		 (incf index)
		 (incf base (ash 1 extra)))))
	   (values bases extras))))
  
  ; length codes 257 through 285, lengths start at 3
  (multiple-value-setq (*base-length* *length-extra-bits*)
    (expand-runs (1+ (- 285 257))
		 3
		 '((8 0) (4 1) (4 2) (4 3) (4 4) (4 5) (1 0))))
  
  ; special case, code 285 is length 258 and not the 259
  ; that the progression above arrives at.
  (setf (svref *base-length* (- 285 257)) 258)
  
  ; distance codes 0 through 29, distances start at 1
  (multiple-value-setq (*base-distance* *distance-extra-bits*)
    (expand-runs (1+ (- 29 0))
		 1
		 '((4 0)
		   (2 1) (2 2) (2 3) (2 4) (2 5) (2 6) (2 7) (2 8)
		   (2 9) (2 10) (2 11) (2 12) (2 13)))))


;;;----------- end table building


;;;----------- Huffman tree support

(defstruct (bitinfo (:type list))
  ;; when we describe a range of values and the code width we
  ;; use a list of three elements.  this structure describes it.
  ;; Because the representation is a plain list, a literal such as
  ;; (0 143 8) can be used wherever a bitinfo is wanted.
  minval	; first value in the range
  maxval	; last value in the range (inclusive)
  bitwidth)	; number of bits in the code of each value in the range


;test case
; (generate-huffman-tree '((0 4 3) (5 5 2) (6 7 4)))
; will generate sample table from the Deutsch paper
;

(defun generate-huffman-tree (bitinfo)
  ;; bitinfo is a list of bitinfo items (minval maxval bitwidth)
  ;; each of which says that the values from minval through maxval
  ;; are to be represented by codes that are bitwidth bits wide.
  ;;
  ;; we return two values: the huffman tree (a tree of conses with
  ;; the values at the leaves, see build-huffman-tree) and the
  ;; minimum bit width found in bitinfo.
  ;;
  (let* ((maxval (reduce #'max bitinfo 
			 :key #'bitinfo-maxval 
			 :initial-value 0))
	 (minval (reduce #'min bitinfo 
			 :key #'bitinfo-minval
			 :initial-value most-positive-fixnum))
	 (maxbitwidth (reduce #'max bitinfo
			      :key #'bitinfo-bitwidth
			      :initial-value 0))
	 (minbitwidth (reduce #'min bitinfo
			      :key #'bitinfo-bitwidth
			      :initial-value most-positive-fixnum))
	 (nvalues (1+ (- maxval minval)))
	 
	 ; indexed by bit width:
	 ;  how many codes have that width, and the next unassigned
	 ;  code of that width
	 (bitwidthcounts (make-array (1+ maxbitwidth) :initial-element 0))
	 (nextcode (make-array (1+ maxbitwidth) :initial-element 0))
	 
	 ; indexed by value - minval:
	 ;  the code chosen for the value, and the code's width
	 ;  (a width of zero means the value has no code)
	 (valuecode (make-array nvalues))
	 (valuewidth (make-array nvalues :initial-element 0)))
    
    ; record the width of each value and count the codes per width
    (dolist (bi bitinfo)
      (let ((lo (bitinfo-minval bi))
	    (hi (bitinfo-maxval bi))
	    (width (bitinfo-bitwidth bi)))
	(do ((v lo (1+ v)))
	    ((> v hi))
	  (setf (svref valuewidth (- v minval)) width))
	(incf (svref bitwidthcounts width) (1+ (- hi lo)))))
    
    ; the first code of a given width follows on from the codes of
    ; the next smaller width, shifted left by one bit
    (let ((code 0))
      (do ((width 1 (1+ width)))
	  ((> width maxbitwidth))
	(setq code (ash (+ code (svref bitwidthcounts (1- width))) 1))
	(setf (svref nextcode width) code)))
    
    ; hand out the codes of each width in increasing value order
    (dotimes (offset nvalues)
      (let ((width (svref valuewidth offset)))
	(if* (not (zerop width))
	   then (setf (svref valuecode offset) (svref nextcode width))
		(incf (svref nextcode width)))))
    
    ;; now that every value has its code, build the decoding tree
    ;; starting with the most significant bit of the codes
    (values (build-huffman-tree 
	     minval
	     (mapcar #'(lambda (bi) 
			 (cons (bitinfo-minval bi) (bitinfo-maxval bi)))
		     bitinfo)
	     valuecode 
	     valuewidth 
	     1)
	    ; second value useful for decoding:
	    minbitwidth)))


(defun build-huffman-tree (minval minmaxes valuecode valuewidth pos)
  ;; compute a huffman cons tree for the values described by minmaxes,
  ;; a list of conses each representing a (min . max) range of values.
  ;; The car of the tree is the subtree for a zero in bit position
  ;; pos (1 based, counted from the msbit of the code) and the cdr is
  ;; the subtree for a one in that position.  
  ;; A subtree that isn't a cons is what split-on-position returned
  ;; directly for that bit: the value selected, or nil.
  ;;
  (flet ((subtree (branch)
	   ; a list of ranges still needs further bits to tell
	   ; the values apart
	   (if* (consp branch)
	      then (build-huffman-tree minval branch 
				       valuecode valuewidth 
				       (1+ pos))
	      else branch)))
    (multiple-value-bind (zero one) 
	(split-on-position minval minmaxes valuecode valuewidth pos)
      (let* ((zero-side (subtree zero))
	     (one-side  (subtree one)))
	(cons zero-side one-side)))))

(defun split-on-position (minval minmaxes valuecode valuewidth pos)
  ;; compute those values that have a zero in the pos (1 based) position
  ;; of their code and those that have one in that position.
  ;; return two values, the zero set and the one set.
  ;; The position is from the msbit of the huffman code.
  ;;
  ;; If the value of the specified pos selects a specific value
  ;; and no further bits need be read to identify that value then
  ;; we return that value rather than a list of conses.
  ;;
  ;; minmaxes is a list of (min . max) conses naming the values to
  ;; consider.  valuecode and valuewidth are vectors indexed by 
  ;; value - minval giving the code and the code width of each value.
  ;;
  ;; Each set returned is thus one of: nil (no value has that bit),
  ;; an integer (the value completely identified by that bit), or a 
  ;; list of (min . max) conses in increasing order.
  
  (let (zero one)
    (dolist (mm minmaxes)
      (do ((v (car mm) (1+ v)))
	  ((> v (cdr mm)))
	(let ((width (svref valuewidth (- v minval)))
	      (code  (svref valuecode  (- v minval))))
	  ; bit number width-pos counted from the lsbit is the
	  ; pos'th bit counted from the msbit of a width bit code
	  (if* (logbitp (- width pos) code)
	     then ; one bit set
		  (if* (eql width pos)
		     then ; last bit
			  (setq one v)
		     else ; more bits to check
			  ; the ranges are kept most recent first, so
			  ; only the first one can be extended by v
			  (let ((firstone (car one)))
			    (if* (and firstone 
				      (eq (cdr firstone) (1- v)))
			       then ; increase range
				    (setf (cdr firstone) v)
			       else (push (cons v v) one))))
	     else ; zero bit set
		  (if* (eql width pos)
		     then ; last bit
			  (setq zero v)
		     else ; more bits to check
			  (let ((firstzero (car zero)))
			    (if* (and firstzero
				      (eq (cdr firstzero) (1- v)))
			       then ; increase range
				    (setf (cdr firstzero) v)
			       else (push (cons v v) zero))))))))
    (values 
     (if* (consp zero) then (nreverse zero) else zero) ; order numerically
     (if* (consp one)  then (nreverse one)  else one))))


(defun generate-huffman-tree-from-vector (vector start end)
  ;; generate huffman tree from the code widths found in the vector 
  ;; from index start to end-1.
  ;; The item at index start is the width for value 0, the next
  ;; item is the width for value 1, and so on.  A width of zero means 
  ;; that the value has no code and it is left out.
  ;; Returns what generate-huffman-tree returns: the tree and
  ;; the minimum bit width.
  (let ((ranges nil))
    (dotimes (val (- end start))
      (let ((len (svref vector (+ start val))))
	(if* (> len 0)
	   then ; one single-value range per coded value
		(push (list val val len) ranges))))
    (generate-huffman-tree (nreverse ranges))))

      
;; the huffman tree to use for type 1 blocks
;; (literal/length values 0 through 287 with the given code widths).
;; Only the first value returned by generate-huffman-tree, the tree,
;; is kept here; process-fixed-huffman-block supplies the minimum
;; widths (7 and 5) itself.
;;
(defparameter *fixed-huffman-tree* 
    (generate-huffman-tree '((0 143 8) (144 255 9) (256 279 7) (280 287 8))))

;; distance are represented by a trivial huffman code
;; (every one of the values 0 through 31 has a 5 bit code)
(defparameter *fixed-huffman-distance-tree* 
    (generate-huffman-tree '((0 31 5))))


;;;----------- end Huffman support


(defun process-deflate-block (br op buffer end)
  ;; br is a bit stream, op is the output stream.
  ;; buffer holds the recent output and end is the index in it at
  ;; which the next octet will be stored.
  ;;
  ;; Process the next block in the stream.  A block begins with a
  ;; one bit "final block" flag followed by a two bit block type.
  ;;
  ;; Return false if this was the last block of data (after sending
  ;; what remains in the buffer to op) else return the next index 
  ;; into the buffer.
  (let* ((last-block-p (eql 1 (read-bits br 1)))
	 (next-end (case (read-bits br 2)
		     (0 (process-non-compressed-block br op buffer end))
		     (1 (process-fixed-huffman-block br op buffer end))
		     (2 (process-dynamic-huffman-block br op buffer end))
		     (3 (error "illegal deflate block value")))))
    (if* last-block-p
       then (flush-buffer op buffer next-end)
	    nil
       else next-end)))


(defun process-non-compressed-block (br op buffer end)
  ;; process a block of uncompressed data.
  ;; Such a block starts on a byte boundary with a two byte length
  ;; and then the one's complement of that length, both little endian.
  ;; The data bytes follow and are copied to the buffer as is.
  ;; Returns the next index into the buffer.
  
  ; skip what's left of the byte holding the block header
  (reset-bit-reader br)
  
  (let* ((p (bit-reader-stream br))
	 (len (read-uword p))
	 (onecomplen (read-uword p)))
    (if* (/= (logxor len onecomplen) #xffff)
       then (error "bad length value in non compressed block"))
    (do ((remaining len (1- remaining)))
	((<= remaining 0) end)
      (setq end (put-byte-in-buffer op (read-byte p) buffer end)))))

(defun read-uword (stream)
  ;; read an unsigned two byte value stored low order byte first
  (let* ((low  (read-byte stream))
	 (high (read-byte stream)))
    (logior low (ash high 8))))

(defun put-byte-in-buffer (op byte buffer end)
  ;; store the next output byte in the buffer at index end and
  ;; return the index for the byte after it.
  ;; When the buffer is full its contents are first sent to op and
  ;; storing starts over at index 0.
  (let ((slot (if* (< end (length buffer))
		 then end
		 else (flush-buffer op buffer end)
		      0)))
    (setf (aref buffer slot) byte)
    (1+ slot)))

(defun flush-buffer (op buffer end)
  ;; send the first end bytes of the buffer to the output.
  ;; If op is a stream the bytes are written to it, otherwise op
  ;; must be a function and it is called with the buffer and end.
  ;; Nothing at all is done when end is zero.
  (if* (<= end 0)
     then nil
   elseif (streamp op)
     then (write-sequence buffer op :end end)
     else (funcall op buffer end)))


(defun process-fixed-huffman-block (br op buffer end)
  ;; process a huffman block that uses the standard huffman trees.
  ;; The shortest literal/length code in the standard tree is 7 bits
  ;; and every standard distance code is 5 bits.
  ;; Returns the next index into the buffer.
  ;;
  (let ((litlen-tree *fixed-huffman-tree*)
	(dist-tree   *fixed-huffman-distance-tree*))
    (process-huffman-block br op 
			   litlen-tree 7 
			   dist-tree 5
			   buffer end)))

(defun process-huffman-block (br op 
			      lengthlit-tree minwidth 
			      distance-tree mindistwidth
			      buffer end)
  ;; the common code for blocks of type 1 and 2 that does
  ;; the decompression given a length/literal huffman tree
  ;; and a distance huffman tree.  minwidth and mindistwidth are the
  ;; widths of the shortest code in the respective trees.
  ;;
  ;; If the distance tree is nil then we use the trivial huffman 
  ;; code from the algorithm (every distance code is 5 bits wide).
  ;; That code is decoded through *fixed-huffman-distance-tree* and
  ;; not by reading a 5 bit field, since huffman codes are stored
  ;; most significant bit first whereas read-bits assembles its
  ;; result least significant bit first.
  ;;
  ;; Decoded bytes go into buffer starting at index end, and get
  ;; sent to op whenever the buffer fills up.
  ;; Returns the next index into the buffer.
  ;; Signals an error on a length or distance code that the deflate
  ;; format does not define.
  ;;
  (let ((bufflen (length buffer)))
    
    (if* (null distance-tree)
       then (setq distance-tree *fixed-huffman-distance-tree*
		  mindistwidth 5))
		 
    (loop
      (let ((value (decode-huffman-tree br lengthlit-tree minwidth)))
	(if* (< value 256)
	   then ; output and add to buffer
		(setq end (put-byte-in-buffer op value buffer end))
		
	 elseif (eql value 256) 
	   then (return) ; end of block
	 elseif (> value 285)
	   then ; 286 and 287 have codes in the fixed tree but 
		; have no entry in the length tables
		(error "illegal length code in deflate block")
	   else ; we have a length code
		; compute length, distance
		(let* ((adj-code (- value 257))
		       (len (+ (svref *base-length* adj-code)
			       (read-bits br (svref *length-extra-bits*
						    adj-code))))
		       (dist-code (decode-huffman-tree br
						       distance-tree
						       mindistwidth)))
		  
		  ; only distance codes 0 through 29 are in the tables
		  (if* (not (and (integerp dist-code) (<= dist-code 29)))
		     then (error "illegal distance code in deflate block"))
		  
		  (let ((distance 
			 (+ (svref *base-distance* dist-code)
			    (read-bits br (svref *distance-extra-bits*
						 dist-code)))))
		  
		    ; copy in bytes.  The buffer is circular so the
		    ; source index wraps around, and the copy is done
		    ; a byte at a time since the source may overlap 
		    ; the bytes being written.
		    (do ((i (mod (- end distance) bufflen) (1+ i))
			 (count len (1- count)))
			((<= count 0))
		      (if* (>= i bufflen) then (setf i 0))
		      (setq end (put-byte-in-buffer op
						    (aref buffer i)
						    buffer
						    end))))))))
    ; return where we left off
    end))
		    
		
(defparameter *code-index*
    ;; The widths of the codes for the code length values 0 through 18
    ;; are stored in a dynamic block header in this order, i.e.
    ;; the first width read is for value 16, the next for value 17...
    ;; pretty crazy, eh?
    (vector 16 17 18 
	    0 8 7 9 6 10 5 11 4 12 3 13 2 14 1 15))

		  
(defun process-dynamic-huffman-block (br op buffer end)
  ;; process a block that includes a personalized huffman tree
  ;; just for this block.
  ;;
  ;; The block header holds three counts:
  ;;   hlit  - number of literal/length code widths, less 257
  ;;   hdist - number of distance code widths, less 1
  ;;   hclen - number of code length code widths, less 4
  ;; followed by the widths needed to build a huffman tree for the
  ;; values 0 through 18.  That tree is then used to read the code 
  ;; widths for the literal/length tree and the distance tree, which 
  ;; are the trees used to decode the data itself.
  ;;
  ;; Returns the next index into the buffer.
  ;; Signals an error if the code widths are not self consistent.
  ;;
  (let* ((hlit (read-bits br 5))
	 (hdist (read-bits br 5))
	 (hclen (read-bits br 4))
	 
	 (litlen-count (+ hlit 257))
	 (total (+ litlen-count hdist 1))
	 
	 (codevec (make-array 19 :initial-element 0))
	 (bigvec  (make-array total :initial-element 0))
	 (index 0)
	 (minlen 9999) 
	 code-length-huffman-tree)
    
    ; read in the huffman code width of each of the numbers
    ; from 0 to 18... this will be then used to create a huffman tree.
    ; minlen tracks the shortest width in use.
    ;
    (dotimes (i (+ hclen 4))
      (let ((len (read-bits br 3)))
	(setf (svref codevec (svref *code-index* i)) len)
	(if* (> len 0) then (setq minlen (min len minlen)))))
      
    (setq code-length-huffman-tree 
      (generate-huffman-tree-from-vector codevec 0 (length codevec)))
    
    ; now we're in position to read the code lengths for the
    ; huffman table that will allow us to read the data.
    ; (Is this a nutty algorithm or what??)
    ;
    ; The widths for both trees are read as one sequence, thus a 
    ; repeat can run from the literal/length widths into the distance
    ; widths.  It must not run off the end of the sequence though.
    ;
    (flet ((store-run (width count)
	     (if* (> (+ index count) total)
		then (error 
		      "code length run too long in dynamic huffman block"))
	     (dotimes (i count)
	       (setf (svref bigvec index) width)
	       (incf index))))
      (loop
	(if* (>= index total) then (return))
	(let ((val (decode-huffman-tree br code-length-huffman-tree minlen)))
	  (if* (<= val 15)
	     then ; literal value
		  (store-run val 1)
	   elseif (eql val 16)
	     then ; repeat prev 3 to 6 times
		  (if* (eql index 0)
		     then (error 
			   "nothing to repeat in dynamic huffman block"))
		  (store-run (svref bigvec (1- index))
			     (+ 3 (read-bits br 2)))
	   elseif (eql val 17)
	     then ; repeat zero 3 to 10 times
		  (store-run 0 (+ 3 (read-bits br 3)))
	   elseif (eql val 18)
	     then ; repeat zero a lot (11 to 138 times)
		  (store-run 0 (+ 11 (read-bits br 7)))))))
      
    (let (literal-length-huffman 
	  litlen-width
	  (distance-huffman nil)
	  (distance-width 0))
      (multiple-value-setq (literal-length-huffman litlen-width)
	(generate-huffman-tree-from-vector bigvec 0 litlen-count))
      
      ; A block consisting only of literals is permitted to give 
      ; every distance value a width of zero.  There's no tree to
      ; build then (generate-huffman-tree can't handle an empty set of
      ; values) and none is needed since a well formed block of that
      ; kind has no length codes in it and thus never looks for a 
      ; distance.
      (if* (find-if #'plusp bigvec :start litlen-count)
	 then (multiple-value-setq (distance-huffman distance-width)
		(generate-huffman-tree-from-vector bigvec 
						   litlen-count
						   total)))
      
      (process-huffman-block br op literal-length-huffman litlen-width
			     distance-huffman distance-width
			     buffer end))))


(defun decode-huffman-tree (br tree minbits)
  ;; find the next huffman encoded value in the bit stream br.
  ;; tree is a huffman cons tree: at each cons a zero bit selects the
  ;; car and a one bit selects the cdr, and anything that is not a
  ;; cons is the answer.
  ; No code is shorter than minbits so to speed processing that
  ; many bits are grabbed with one call and used to descend the
  ; tree, first bit read first.  After that we go bit by bit until
  ; the answer is found.
  (let ((node tree)
	(prefix (read-bits br minbits)))
    (dotimes (i minbits)
      (setq node (if* (logbitp 0 prefix)
		    then (cdr node)
		    else (car node)))
      (setq prefix (ash prefix -1)))
    (loop
      (if* (atom node) then (return node))
      (setq node (if* (eql 1 (read-bits br 1))
		    then (cdr node)
		    else (car node))))))


;;; test case...
;; Read file created with gzip and write the uncompressed version
;; to another file.  
;;
;; Porting note: the open below works on ACL since it creates
;;   a bivalent simple-stream.   If you run this on other lisps
;;   you'll want to specify an :element-type of '(unsigned-byte 8)
;;
#+ignore
(defun testit (&optional (filename "foo.n.gz") (output-filename "out"))
  ;; gunzip the file named filename into the file named 
  ;; output-filename, replacing that file if it already exists.
  (with-open-file (in filename :direction :input)
    ; get past the gzip header to the deflated data
    (skip-gzip-header in)
    (with-open-file (out output-filename 
		     :if-exists :supersede
		     :direction :output)
      (inflate in out))))