4747import org .xml .sax .SAXException ;
4848
4949/**
50- * This abstract class is a base class for other stream
51- * serializers (xml, html, text ...) that write output to a stream.
50+ * This abstract class is a base class for other stream serializers
51+ * (xml, html, text ...) that write output to a stream. Note that
52+ * this is stateful, NOT designed to be multithreaded; each thread and
53+ * each output stream should have its own instance.
5254 *
5355 * @xsl.usage internal
5456 */
@@ -61,7 +63,6 @@ abstract public class ToStream extends SerializerBase
6163 /** Stack to keep track of disabling output escaping. */
6264 protected BoolStack m_disableOutputEscapingStates = new BoolStack ();
6365
64-
6566 /**
6667 * The encoding information associated with this serializer.
6768 * Although initially there is no encoding,
@@ -174,7 +175,40 @@ abstract public class ToStream extends SerializerBase
174175 * which is existing older behavior.
175176 */
176177 private boolean m_expandDTDEntities = true ;
177-
178+
179+ /**
180+ * Traditionally, we handled Surrogate Character Pairs by looking
181+ * ahead in the input buffer. This could fail if, eg, the pair crossed
182+ * between one call to characters() and the next, which can happen
183+ * since SAX providers are free to manage buffering as they see fit
184+ * and what the XML Data Model considers a single block of text
185+ * may be delivered in multiple calls.
186+ *
187+ * The more robust solution is to maintain state, setting the High
188+ * UTF16 Surrogate character aside and processing it when the Low
189+ * Surrogate arrives.
190+ *
191+ * However, handling this robustly requires recognizing, and
192+ * handling, cases where a Surrogate appears but is not adjacent to
193+ * the other half of the pair. That's illegal UTF16, but as utility
194+ * code we can't guarantee some caller won't attempt it.
195+ *
196+ * Historically, we have handled this one of two ways, either
197+ * generating an IOException with ER_INVALID_UTF16_SURROGATE or
198+ * outputting the bad surrogate as a Numeric Character Reference
199+ * (and possibly issuing a message to stderr, as in ToTextStream).
200+ * The inconsistency annoys me a bit. Only SGML-based formats
201+ * support NCRs, and XML explicitly says that even an NCR may not
202+ * represent an isolated surrogate. Hence, for correctness, we AT
203+ * LEAST want the stderr message, and arguably should be throwing
204+ * the exception. However, if we change any of this behavior we
205+ * want to be able to revert to the prior response, in case some
206+ * user is actually expecting to see that.
207+ *
208+ * Note that since we process char arrays, the "pending high surrogate"
209+ * buffer is a char, with 0 used to indicate "empty buffer".
210+ */
211+ private char m_pendingUTF16HighSurrogate = 0 ;
178212
179213 /**
180214 * Default constructor
@@ -959,67 +993,69 @@ protected boolean escapingNotNeeded(char ch)
959993 /**
960994 * Once a surrogate has been detected, write out the pair of
961995 * characters if it is in the encoding, or if there is no
962- * encoding, otherwise write out an entity reference
996+ * encoding, otherwise write out a numeric character reference
963997 * of the value of the unicode code point of the character
964998 * represented by the high/low surrogate pair.
965999 * <p>
9661000 * An exception is thrown if there is no low surrogate in the pair,
9671001 * because the array ends unexpectedly, or if the low char is there
9681002 * but its value is such that it is not a low surrogate.
9691003 *
970- * @param c the first (high) part of the surrogate, which
1004+ * @param high the first (high) part of the surrogate, which
9711005 * must be confirmed before calling this method.
9721006 * @param ch Character array.
9731007 * @param i position Where the surrogate was detected.
9741008 * @param end The end index of the significant characters.
9751009 * @return 0 if the pair of characters was written out as-is,
9761010 * the unicode code point of the character represented by
977- * the surrogate pair if an entity reference with that value
1011+ * the surrogate pair if a numeric char ref with that value
9781012 * was written out.
9791013 *
9801014 * @throws IOException if invalid UTF-16 surrogate detected.
9811015 */
982- protected int writeUTF16Surrogate (char c , char ch [], int i , int end )
1016+ protected int writeUTF16Surrogate (final char high , char ch [], int i , int end )
9831017 throws IOException
9841018 {
985- int codePoint = 0 ;
1019+ // THROWS if surrogate pair crosses input buffers
1020+ // Should probably handle this better.
9861021 if (i + 1 >= end )
9871022 {
9881023 throw new IOException (
9891024 Utils .messages .createMessage (
9901025 MsgKey .ER_INVALID_UTF16_SURROGATE ,
991- new Object [] { Integer .toHexString ((int ) c )}));
1026+ new Object [] { Integer .toHexString ((int ) high )}));
9921027 }
9931028
994- final char high = c ;
9951029 final char low = ch [i +1 ];
9961030 if (!Encodings .isLowUTF16Surrogate (low )) {
9971031 throw new IOException (
9981032 Utils .messages .createMessage (
9991033 MsgKey .ER_INVALID_UTF16_SURROGATE ,
10001034 new Object [] {
1001- Integer .toHexString ((int ) c )
1035+ Integer .toHexString ((int ) high )
10021036 + " "
10031037 + Integer .toHexString (low )}));
10041038 }
10051039
10061040 final java .io .Writer writer = m_writer ;
1041+ int codePoint = 0 ; // Nonzero iff written as NCR
10071042
10081043 // If we make it to here we have a valid high, low surrogate pair
1009- if (m_encodingInfo .isInEncoding (c ,low )) {
1044+ if (m_encodingInfo .isInEncoding (high ,low )) {
10101045 // The character formed by the surrogate pair
10111046 // is in the encoding, so just write it out
1047+ // NOTE: Assumes same buffer
10121048 writer .write (ch ,i ,2 );
10131049 }
10141050 else {
10151051 // Don't know what to do with this char, it is
10161052 // not in the encoding and not a high char in
1017- // a surrogate pair, so write out as an entity ref
1053+ // a surrogate pair, so write out as a numeric char ref
10181054 final String encoding = getEncoding ();
10191055 if (encoding != null ) {
10201056 /* The output encoding is known,
10211057 * so something is wrong.
1022- */
1058+ */
10231059 codePoint = Encodings .toCodePoint (high , low );
10241060 // not in the encoding, so write out a character reference
10251061 writer .write ('&' );
@@ -1033,7 +1069,10 @@ protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
10331069 writer .write (ch , i , 2 );
10341070 }
10351071 }
1036- // non-zero only if character reference was written out.
1072+
1073+ // ToTextStream tests this and issues an error message (but
1074+ // not exception) if the not-in-encoding case arises,
1075+ // outputting an NCR in passing.
10371076 return codePoint ;
10381077 }
10391078
0 commit comments