;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*- ;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.16 2008/05/20 23:01:50 edi Exp $ ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; * Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; * Redistributions in binary form must reproduce the above ;;; copyright notice, this list of conditions and the following ;;; disclaimer in the documentation and/or other materials ;;; provided with the distribution. ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (in-package :flexi-streams) (defun recover-from-encoding-error (external-format format-control &rest format-args) "Helper function used by OCTETS-TO-CHAR-CODE below to deal with encoding errors. Checks if *SUBSTITUTION-CHAR* is not NIL and returns its character code in this case. Otherwise signals an EXTERNAL-FORMAT-ENCODING-ERROR as determined by the arguments to this function and provides a corresponding USE-VALUE restart." (when *substitution-char* (return-from recover-from-encoding-error (char-code *substitution-char*))) (restart-case (apply #'signal-encoding-error external-format format-control format-args) (use-value (char) :report "Specify a character to be used instead." :interactive (lambda () (loop (format *query-io* "Type a character: ") (let ((line (read-line *query-io*))) (when (= 1 (length line)) (return (list (char line 0))))))) (char-code char)))) (defgeneric octets-to-char-code (format reader) (declare #.*standard-optimize-settings*) (:documentation "Converts a sequence of octets to a character code \(which is returned, or NIL in case of EOF) using the external format FORMAT. The sequence is obtained by calling the function \(which must be a functional object) READER with no arguments which should return one octet per call. In the case of EOF, READER should return NIL. The special variable *CURRENT-UNREADER* must be bound correctly whenever this function is called.")) (defmethod octets-to-char-code ((format flexi-latin-1-format) reader) (declare #.*fixnum-optimize-settings*) (declare (function reader)) (funcall reader)) (defmethod octets-to-char-code ((format flexi-ascii-format) reader) (declare #.*fixnum-optimize-settings*) (declare (function reader)) (when-let (octet (funcall reader)) (if (> (the octet octet) 127) (recover-from-encoding-error format "No character which corresponds to octet #x~X." octet) octet))) (defmethod octets-to-char-code ((format flexi-8-bit-format) reader) (declare #.*fixnum-optimize-settings*) (declare (function reader)) (with-accessors ((decoding-table external-format-decoding-table)) format (when-let (octet (funcall reader)) (let ((char-code (aref (the (simple-array char-code-integer *) decoding-table) (the octet octet)))) (if (or (null char-code) (= (the char-code-integer char-code) 65533)) (recover-from-encoding-error format "No character which corresponds to octet #x~X." octet) char-code))))) (defmethod octets-to-char-code ((format flexi-utf-8-format) reader) (declare #.*fixnum-optimize-settings*) (declare (function reader)) (let (first-octet-seen) (declare (boolean first-octet-seen)) (macrolet ((read-next-byte () '(prog1 (or (funcall reader) (cond (first-octet-seen (return-from octets-to-char-code (recover-from-encoding-error format "End of data while in UTF-8 sequence."))) (t (return-from octets-to-char-code nil)))) (setq first-octet-seen t)))) (let ((octet (read-next-byte))) (declare (type octet octet)) (multiple-value-bind (start count) (cond ((not (logbitp 7 octet)) (values octet 0)) ((= #b11000000 (logand octet #b11100000)) (values (logand octet #b00011111) 1)) ((= #b11100000 (logand octet #b11110000)) (values (logand octet #b00001111) 2)) ((= #b11110000 (logand octet #b11111000)) (values (logand octet #b00000111) 3)) ((= #b11111000 (logand octet #b11111100)) (values (logand octet #b00000011) 4)) ((= #b11111100 (logand octet #b11111110)) (values (logand octet #b00000001) 5)) (t (return-from octets-to-char-code (recover-from-encoding-error format "Unexpected value #x~X at start of UTF-8 sequence." octet)))) (declare (fixnum count)) ;; note that we currently don't check for "overlong" ;; sequences or other illegal values (loop for result of-type code-point = start then (+ (ash result 6) (logand octet #b111111)) repeat count for octet of-type octet = (read-next-byte) unless (= #b10000000 (logand octet #b11000000)) do (return-from octets-to-char-code (recover-from-encoding-error format "Unexpected value #x~X in UTF-8 sequence." octet)) finally (return result))))))) (defmethod octets-to-char-code ((format flexi-utf-16-le-format) reader) (declare #.*fixnum-optimize-settings*) (declare (function reader)) (let (first-octet-seen) (declare (boolean first-octet-seen)) (macrolet ((read-next-byte () '(prog1 (or (funcall reader) (cond (first-octet-seen (return-from octets-to-char-code (recover-from-encoding-error format "End of data while in UTF-16 sequence."))) (t (return-from octets-to-char-code nil)))) (setq first-octet-seen t)))) (flet ((read-next-word () (+ (the octet (read-next-byte)) (ash (the octet (read-next-byte)) 8)))) (declare (inline read-next-word)) (let ((word (read-next-word))) (declare (type (unsigned-byte 16) word)) (cond ((<= #xd800 word #xdfff) (let ((next-word (read-next-word))) (declare (type (unsigned-byte 16) next-word)) (unless (<= #xdc00 next-word #xdfff) (return-from octets-to-char-code (recover-from-encoding-error format "Unexpected UTF-16 word #x~X following #x~X." next-word word))) (+ (ash (logand #b1111111111 word) 10) (logand #b1111111111 next-word) #x10000))) (t word))))))) (defmethod octets-to-char-code ((format flexi-utf-16-be-format) reader) (declare #.*fixnum-optimize-settings*) (declare (function reader)) (let (first-octet-seen) (declare (boolean first-octet-seen)) (macrolet ((read-next-byte () '(prog1 (or (funcall reader) (cond (first-octet-seen (return-from octets-to-char-code (recover-from-encoding-error format "End of data while in UTF-16 sequence."))) (t (return-from octets-to-char-code nil)))) (setq first-octet-seen t)))) (flet ((read-next-word () (+ (ash (the octet (read-next-byte)) 8) (the octet (read-next-byte))))) (declare (inline read-next-word)) (let ((word (read-next-word))) (declare (type (unsigned-byte 16) word)) (cond ((<= #xd800 word #xdfff) (let ((next-word (read-next-word))) (declare (type (unsigned-byte 16) next-word)) (unless (<= #xdc00 next-word #xdfff) (return-from octets-to-char-code (recover-from-encoding-error format "Unexpected UTF-16 word #x~X following #x~X." next-word word))) (+ (ash (logand #b1111111111 word) 10) (logand #b1111111111 next-word) #x10000))) (t word))))))) (defmethod octets-to-char-code ((format flexi-utf-32-le-format) reader) (declare #.*fixnum-optimize-settings*) (declare (function reader)) (let (first-octet-seen) (declare (boolean first-octet-seen)) (macrolet ((read-next-byte () '(prog1 (or (funcall reader) (cond (first-octet-seen (return-from octets-to-char-code (recover-from-encoding-error format "End of data while in UTF-32 sequence."))) (t (return-from octets-to-char-code nil)))) (setq first-octet-seen t)))) (loop for count of-type fixnum from 0 to 24 by 8 for octet of-type octet = (read-next-byte) sum (ash octet count))))) (defmethod octets-to-char-code ((format flexi-utf-32-be-format) reader) (declare #.*fixnum-optimize-settings*) (declare (function reader)) (let (first-octet-seen) (declare (boolean first-octet-seen)) (macrolet ((read-next-byte () '(prog1 (or (funcall reader) (cond (first-octet-seen (return-from octets-to-char-code (recover-from-encoding-error format "End of data while in UTF-32 sequence."))) (t (return-from octets-to-char-code nil)))) (setq first-octet-seen t)))) (loop for count of-type fixnum from 24 downto 0 by 8 for octet of-type octet = (read-next-byte) sum (ash octet count))))) (defmethod octets-to-char-code ((format flexi-cr-mixin) reader) (declare #.*fixnum-optimize-settings*) (declare (ignore reader)) (let ((char-code (call-next-method))) (case char-code (#.(char-code #\Return) #.(char-code #\Newline)) (otherwise char-code)))) (defmethod octets-to-char-code ((format flexi-crlf-mixin) reader) (declare #.*fixnum-optimize-settings*) (declare (function *current-unreader*)) (declare (ignore reader)) (let ((char-code (call-next-method))) (case char-code (#.(char-code #\Return) (let ((next-char-code (call-next-method))) (case next-char-code (#.(char-code #\Linefeed) #.(char-code #\Newline)) ;; we saw a CR but no LF afterwards, but then the data ;; ended, so we just return #\Return ((nil) #.(char-code #\Return)) ;; if the character we peeked at wasn't a ;; linefeed character we unread its constituents (otherwise (funcall *current-unreader* (code-char next-char-code)) char-code)))) (otherwise char-code))))