;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*- ;;; $Header: /usr/local/cvsrep/flexi-streams/input.lisp,v 1.51 2007/12/29 22:58:43 edi Exp $ ;;; Copyright (c) 2005-2007, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; * Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; * Redistributions in binary form must reproduce the above ;;; copyright notice, this list of conditions and the following ;;; disclaimer in the documentation and/or other materials ;;; provided with the distribution. ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (in-package :flexi-streams) #-:lispworks (defmethod read-byte* ((flexi-input-stream flexi-input-stream)) "Reads one byte \(octet) from the underlying stream of FLEXI-OUTPUT-STREAM \(or from the internal stack if it's not empty)." (declare (optimize speed)) ;; we're using S instead of STREAM here because of an ;; issue with SBCL: ;; (with-accessors ((position flexi-stream-position) (bound flexi-stream-bound) (octet-stack flexi-stream-octet-stack) (s flexi-stream-stream)) flexi-input-stream (declare (integer position) (type (or null integer) bound)) (when (and bound (>= position bound)) (return-from read-byte* nil)) (incf position) (or (pop octet-stack) (read-byte s nil nil) (progn (decf position) nil)))) #+:lispworks (defmethod read-byte* ((flexi-input-stream flexi-input-stream)) "Reads one byte \(octet) from the underlying stream of FLEXI-OUTPUT-STREAM \(or from the internal stack if it's not empty)." (declare (optimize speed)) (with-accessors ((position flexi-stream-position) (bound flexi-stream-bound) (octet-stack flexi-stream-octet-stack) (stream flexi-stream-stream)) flexi-input-stream (declare (integer position) (type (or null integer) bound)) (when (and bound (>= position bound)) (return-from read-byte* nil)) (incf position) (or (pop octet-stack) ;; we use READ-SEQUENCE because READ-BYTE doesn't work with all ;; bivalent streams in LispWorks (let* ((buffer (make-array 1 :element-type 'octet)) (new-position (read-sequence buffer stream))) (cond ((zerop new-position) (decf position) nil) (t (aref buffer 0))))))) #+:lispworks (defmethod read-byte* ((flexi-input-stream flexi-binary-input-stream)) "Reads one byte \(octet) from the underlying stream of FLEXI-OUTPUT-STREAM \(or from the internal stack if it's not empty). Optimized version \(only needed for LispWorks) in case the underlying stream is binary." (declare (optimize speed)) (with-accessors ((position flexi-stream-position) (bound flexi-stream-bound) (octet-stack flexi-stream-octet-stack) (stream flexi-stream-stream)) flexi-input-stream (declare (integer position) (type (or null integer) bound)) (when (and bound (>= position bound)) (return-from read-byte* nil)) (incf position) (or (pop octet-stack) (read-byte stream nil nil) (progn (decf position) nil)))) (defmethod stream-clear-input ((flexi-input-stream flexi-input-stream)) "Calls the corresponding method for the underlying input stream and also clears the value of the OCTET-STACK slot." (declare (optimize speed)) ;; note that we don't reset the POSITION slot (with-accessors ((octet-stack flexi-stream-octet-stack) (stream flexi-stream-stream)) flexi-input-stream (setq octet-stack nil) (clear-input stream))) (defmethod stream-listen ((flexi-input-stream flexi-input-stream)) "Calls the corresponding method for the underlying input stream but first checks if \(old) input is available in the OCTET-STACK slot." (declare (optimize speed)) (with-accessors ((position flexi-stream-position) (bound flexi-stream-bound) (octet-stack flexi-stream-octet-stack) (stream flexi-stream-stream)) flexi-input-stream (when (and bound (>= position bound)) (return-from stream-listen nil)) (or octet-stack (listen stream)))) (defmethod stream-read-byte ((stream flexi-input-stream)) "Reads one byte \(octet) from the underlying stream." (declare (optimize speed)) ;; set LAST-CHAR-CODE slot to NIL because we can't UNREAD-CHAR after ;; this operation (with-accessors ((last-char-code flexi-stream-last-char-code) (last-octet flexi-stream-last-octet)) stream (setq last-char-code nil) (let ((octet (read-byte* stream))) (setq last-octet octet) (or octet :eof)))) (defgeneric unread-char% (char-code flexi-input-stream) (:documentation "Used internally to put a character denoted by the character code CHAR-CODE which was already read back on the stream. Uses the OCTET-STACK slot and decrements the POSITION slot accordingly.")) (defmethod unread-char% (char-code (flexi-input-stream flexi-input-stream)) "The default method which is un-optimized and uses TRANSLATE-CHAR to figure out which octets to put on the octet stack." (declare (optimize speed) (inline translate-char)) (with-accessors ((position flexi-stream-position) (octet-stack flexi-stream-octet-stack) (external-format flexi-stream-external-format)) flexi-input-stream (declare (integer position)) (let ((octets-read (translate-char char-code external-format))) (decf position (length octets-read)) (setq octet-stack (append octets-read octet-stack))))) (defmethod unread-char% (char-code (flexi-input-stream flexi-latin-1-input-stream)) "For ISO-8859-1 we can simply put the character code itself on the octet stack." (declare (optimize speed)) (with-accessors ((position flexi-stream-position) (octet-stack flexi-stream-octet-stack)) flexi-input-stream (declare (integer position)) (decf position) (push char-code octet-stack))) (defmethod unread-char% (char-code (flexi-input-stream flexi-ascii-input-stream)) "For ASCII we can simply put the character code itself on the octet stack." (declare (optimize speed)) (with-accessors ((position flexi-stream-position) (octet-stack flexi-stream-octet-stack)) flexi-input-stream (declare (integer position)) (decf position) (push char-code octet-stack))) (defmethod unread-char% (char-code (flexi-input-stream flexi-8-bit-input-stream)) "For 8-bit encodings we just have to put one octet on the octet stack which we can look up in the encoding hash." (declare (optimize speed)) (with-accessors ((position flexi-stream-position) (octet-stack flexi-stream-octet-stack) (encoding-hash flexi-stream-encoding-hash)) flexi-input-stream (declare (integer position)) (decf position) (push (gethash char-code encoding-hash) octet-stack))) (defmethod unread-char% ((char-code (eql #.(char-code #\Newline))) (flexi-input-stream flexi-cr-8-bit-input-stream)) "A kind of `safety net' for the optimized 8-bit versions of UNREAD-CHAR% which checks for the single case where more than one octet has to be put on the octet stack." (declare (optimize speed)) (with-accessors ((position flexi-stream-position) (octet-stack flexi-stream-octet-stack) (external-format flexi-stream-external-format)) flexi-input-stream (declare (integer position)) ;; note that below we use the knowledge that in all 8-bit encodings ;; #\Return and #\Linefeed are mapped to the same octets (case (external-format-eol-style external-format) (:crlf (decf position 2) (push #.(char-code #\Linefeed) octet-stack) (push #.(char-code #\Return) octet-stack)) (otherwise (decf position) (push #.(char-code #\Return) octet-stack))))) #+:lispworks (defmethod unread-char% ((char-code (eql #.(char-code #\Newline))) (flexi-input-stream flexi-binary-cr-8-bit-input-stream)) "A kind of `safety net' for the optimized 8-bit versions of UNREAD-CHAR% which checks for the single case where more than one octet has to be put on the octet stack. This method \(identical to the one defined directly above) exists only for LispWorks' \"binary\" streams and must be there due to the slightly clunky class hierarchy." (declare (optimize speed)) (with-accessors ((position flexi-stream-position) (octet-stack flexi-stream-octet-stack) (external-format flexi-stream-external-format)) flexi-input-stream (declare (integer position)) ;; note that below we use the knowledge that in all 8-bit encodings ;; #\Return and #\Linefeed are mapped to the same octets (case (external-format-eol-style external-format) (:crlf (decf position 2) (push #.(char-code #\Linefeed) octet-stack) (push #.(char-code #\Return) octet-stack)) (otherwise (decf position) (push #.(char-code #\Return) octet-stack))))) (defmacro define-char-reader ((stream-var stream-class) &body body) "Helper macro to define methods for STREAM-READ-CHAR. Defines a method for the class STREAM-CLASS using the variable STREAM-VAR and the code body BODY wrapped with some standard code common to all methods defined here. The return value of BODY is a character code. In case of encoding problems, BODY must return the value returned by \(RECOVER-FROM-ENCODING-ERROR ...)." (with-unique-names (char-code body-fn) `(defmethod stream-read-char ((,stream-var ,stream-class)) "This method was generated with the DEFINE-CHAR-READER macro." (declare (optimize speed)) ;; note that we do nothing for the :LF EOL style because we ;; assume that #\Newline is the same as #\Linefeed in all ;; Lisps which will use this library (with-accessors ((last-octet flexi-stream-last-octet) (last-char-code flexi-stream-last-char-code)) ,stream-var ;; set LAST-OCTET slot to NIL because we can't UNREAD-BYTE after ;; this operation (setq last-octet nil) (let ((,char-code (flet ((,body-fn () ,@body)) (declare (inline ,body-fn) (dynamic-extent (function ,body-fn))) (,body-fn)))) ;; remember this character and the current external format ;; for UNREAD-CHAR (setq last-char-code ,char-code) (or (code-char ,char-code) ,char-code)))))) (defun recover-from-encoding-error (flexi-stream format-control &rest format-args) "Helper function used by the STREAM-READ-CHAR methods below to deal with encoding errors. Checks if *SUBSTITUTION-CHAR* is not NIL and returns its character code in this case. Otherwise signals a FLEXI-STREAM-ENCODING-ERROR as determined by the arguments to this function and provides a corresponding USE-VALUE restart." (when *substitution-char* (return-from recover-from-encoding-error (char-code *substitution-char*))) (restart-case (apply #'signal-encoding-error flexi-stream format-control format-args) (use-value (char) :report "Specify a character to be used instead." :interactive (lambda () (loop (format *query-io* "Type a character: ") (let ((line (read-line *query-io*))) (when (= 1 (length line)) (return (list (char line 0))))))) (char-code char)))) (define-char-reader (stream flexi-latin-1-input-stream) (or (read-byte* stream) (return-from stream-read-char :eof))) (define-char-reader (stream flexi-ascii-input-stream) (let ((octet (or (read-byte* stream) (return-from stream-read-char :eof)))) (declare (type octet octet)) (if (> octet 127) (recover-from-encoding-error stream "No character which corresponds to octet #x~X." octet) octet))) (define-char-reader (stream flexi-8-bit-input-stream) (with-accessors ((encoding-table flexi-stream-encoding-table)) stream (let* ((octet (or (read-byte* stream) (return-from stream-read-char :eof))) (char-code (aref (the (simple-array * *) encoding-table) octet))) (declare (type octet octet)) (if (or (null char-code) (= char-code 65533)) (recover-from-encoding-error stream "No character which corresponds to octet #x~X." octet) char-code)))) (define-char-reader (stream flexi-utf-8-input-stream) (block body (let (first-octet-seen) (flet ((read-next-byte () (prog1 (or (read-byte* stream) (cond (first-octet-seen (return-from body (recover-from-encoding-error stream "End of file while in UTF-8 sequence."))) (t (return-from stream-read-char :eof)))) (setq first-octet-seen t)))) (declare (inline read-next-byte) (dynamic-extent (function read-next-byte))) (let ((octet (read-next-byte))) (declare (type octet octet)) (multiple-value-bind (start count) (cond ((zerop (logand octet #b10000000)) (values octet 0)) ((= #b11000000 (logand octet #b11100000)) (values (logand octet #b00011111) 1)) ((= #b11100000 (logand octet #b11110000)) (values (logand octet #b00001111) 2)) ((= #b11110000 (logand octet #b11111000)) (values (logand octet #b00000111) 3)) ((= #b11111000 (logand octet #b11111100)) (values (logand octet #b00000011) 4)) ((= #b11111100 (logand octet #b11111110)) (values (logand octet #b00000001) 5)) (t (return-from body (recover-from-encoding-error stream "Unexpected value #x~X at start of UTF-8 sequence." octet)))) ;; note that we currently don't check for "overlong" ;; sequences or other illegal values (loop for result of-type (unsigned-byte 32) = start then (+ (ash result 6) (logand octet #b111111)) repeat count for octet of-type octet = (read-next-byte) unless (= #b10000000 (logand octet #b11000000)) do (return-from body (recover-from-encoding-error stream "Unexpected value #x~X in UTF-8 sequence." octet)) finally (return result)))))))) (define-char-reader (stream flexi-utf-16-le-input-stream) (block body (let (first-octet-seen) (labels ((read-next-byte () (prog1 (or (read-byte* stream) (cond (first-octet-seen (return-from body (recover-from-encoding-error stream "End of file while in UTF-16 sequence."))) (t (return-from stream-read-char :eof)))) (setq first-octet-seen t))) (read-next-word () (+ (the octet (read-next-byte)) (ash (the octet (read-next-byte)) 8)))) (declare (inline read-next-byte read-next-word) (dynamic-extent (function read-next-byte) (function read-next-word))) (let ((word (read-next-word))) (cond ((<= #xd800 word #xdfff) (let ((next-word (read-next-word))) (unless (<= #xdc00 next-word #xdfff) (return-from body (recover-from-encoding-error stream "Unexpected UTF-16 word #x~X following #x~X." next-word word))) (+ (ash (logand #b1111111111 word) 10) (logand #b1111111111 next-word) #x10000))) (t word))))))) (define-char-reader (stream flexi-utf-16-be-input-stream) (block body (let (first-octet-seen) (labels ((read-next-byte () (prog1 (or (read-byte* stream) (cond (first-octet-seen (return-from body (recover-from-encoding-error stream "End of file while in UTF-16 sequence."))) (t (return-from stream-read-char :eof)))) (setq first-octet-seen t))) (read-next-word () (+ (ash (the octet (read-next-byte)) 8) (the octet (read-next-byte))))) (let ((word (read-next-word))) (cond ((<= #xd800 word #xdfff) (let ((next-word (read-next-word))) (unless (<= #xdc00 next-word #xdfff) (return-from body (recover-from-encoding-error stream "Unexpected UTF-16 word #x~X following #x~X." next-word word))) (+ (ash (logand #b1111111111 word) 10) (logand #b1111111111 next-word) #x10000))) (t word))))))) (define-char-reader (stream flexi-utf-32-le-input-stream) (block body (let (first-octet-seen) (flet ((read-next-byte () (prog1 (or (read-byte* stream) (cond (first-octet-seen (return-from body (recover-from-encoding-error stream "End of file while in UTF-32 sequence."))) (t (return-from stream-read-char :eof)))) (setq first-octet-seen t)))) (declare (inline read-next-byte) (dynamic-extent (function read-next-byte))) (loop for count from 0 to 24 by 8 for octet of-type octet = (read-next-byte) sum (ash octet count)))))) (define-char-reader (stream flexi-utf-32-be-input-stream) (block body (let (first-octet-seen) (flet ((read-next-byte () (prog1 (or (read-byte* stream) (cond (first-octet-seen (return-from body (recover-from-encoding-error stream "End of file while in UTF-32 sequence."))) (t (return-from stream-read-char :eof)))) (setq first-octet-seen t)))) (declare (inline read-next-byte) (dynamic-extent (function read-next-byte))) (loop for count from 24 downto 0 by 8 for octet of-type octet = (read-next-byte) sum (ash octet count)))))) (defmethod stream-read-char ((stream flexi-cr-mixin)) "The `base' method for all streams which need end-of-line conversion. Uses CALL-NEXT-METHOD to do the actual work of reading one or more characters from the stream." (declare (optimize speed)) (let ((char (call-next-method))) (when (eq char :eof) (return-from stream-read-char :eof)) (with-accessors ((external-format flexi-stream-external-format) (last-char-code flexi-stream-last-char-code)) stream (when (eql char #\Return) (case (external-format-eol-style external-format) (:cr (setq char #\Newline last-char-code #.(char-code #\Newline))) ;; in the case :CRLF we have to look ahead one character (:crlf (let ((next-char (call-next-method))) (case next-char (#\Linefeed (setq char #\Newline last-char-code #.(char-code #\Newline))) (:eof) ;; if the character we peeked at wasn't a ;; linefeed character we push its ;; constituents back onto our internal ;; octet stack (otherwise (unread-char% (char-code next-char) stream))))))) char))) (defmethod stream-read-char-no-hang ((stream flexi-input-stream)) "Reads one character if the underlying stream has at least one octet available." (declare (optimize speed)) ;; note that this may block for non-8-bit encodings - I think ;; there's no easy way to handle this correctly (and (stream-listen stream) (stream-read-char stream))) (defmethod stream-read-sequence ((flexi-input-stream flexi-input-stream) sequence start end &key) "Reads enough input from STREAM to fill SEQUENCE from START to END. If SEQUENCE is an array which can store octets we use READ-SEQUENCE to fill it in one fell swoop, otherwise we iterate using STREAM-READ-CHAR." (declare (optimize speed) (type (integer 0 *) start end)) (with-accessors ((last-char-code flexi-stream-last-char-code) (last-octet flexi-stream-last-octet) (stream flexi-stream-stream) (position flexi-stream-position) (octet-stack flexi-stream-octet-stack)) flexi-input-stream (declare (integer position)) (cond ((and (arrayp sequence) (subtypep 'octet (array-element-type sequence))) (setf last-char-code nil) (let ((cursor start)) (loop with stack = octet-stack for continuep = (< cursor end) for octet = (and continuep (pop stack)) while octet do (setf (aref sequence cursor) (the octet octet)) (incf cursor)) (let ((index (read-sequence sequence stream :start cursor :end end))) (incf position (- index start)) (when (> index start) (setq last-octet (aref sequence (1- index)))) index))) (t (loop for index from start below end for element = (stream-read-char flexi-input-stream) until (eq element :eof) do (setf (elt sequence index) element) finally (return index)))))) (defmethod stream-unread-char ((stream flexi-input-stream) char) "Implements UNREAD-CHAR for streams of type FLEXI-INPUT-STREAM. Makes sure CHAR will only be unread if it was the last character read and if it was read with the same encoding that's currently being used by the stream." (declare (optimize speed)) (with-accessors ((last-char-code flexi-stream-last-char-code)) stream (unless last-char-code (error 'flexi-stream-simple-error :format-control "No character to unread from this stream \(or external format has changed or last reading operation was binary).")) (unless (= (char-code char) last-char-code) (error 'flexi-stream-simple-error :format-control "Last character read (~S) was different from ~S." :format-arguments (list (code-char last-char-code) char))) (unread-char% last-char-code stream) (setq last-char-code nil) nil)) (defmethod unread-byte (byte (flexi-input-stream flexi-input-stream)) "Similar to UNREAD-CHAR in that it `unreads' the last octet from STREAM. Note that you can only call UNREAD-BYTE after a corresponding READ-BYTE." (declare (optimize speed)) (with-accessors ((last-octet flexi-stream-last-octet) (octet-stack flexi-stream-octet-stack) (position flexi-stream-position)) flexi-input-stream (unless last-octet (error 'flexi-stream-simple-error :format-control "No byte to unread from this stream \(or last reading operation read a character).")) (unless (= byte last-octet) (error 'flexi-stream-simple-error :format-control "Last byte read was different from #x~X." :format-arguments (list byte))) (setq last-octet nil) (decf (the integer position)) (push byte octet-stack) nil)) (defmethod peek-byte ((flexi-input-stream flexi-input-stream) &optional peek-type (eof-error-p t) eof-value) "PEEK-BYTE is like PEEK-CHAR, i.e. it returns an octet from FLEXI-INPUT-STREAM without actually removing it. If PEEK-TYPE is NIL the next octet is returned, if PEEK-TYPE is T, the next octet which is not 0 is returned, if PEEK-TYPE is an octet, the next octet which equals PEEK-TYPE is returned. EOF-ERROR-P and EOF-VALUE are interpreted as usual." (declare (optimize speed)) (loop for octet = (read-byte flexi-input-stream eof-error-p eof-value) until (cond ((null peek-type)) ((eql octet eof-value)) ((eq peek-type t) (plusp octet)) (t (= octet peek-type))) finally (unless (eql octet eof-value) (unread-byte octet flexi-input-stream)) (return octet)))