Skip to content

Commit 162e1f0

Browse files
just documentation/parameter names
1 parent d83b90e commit 162e1f0

2 files changed

Lines changed: 57 additions & 17 deletions

File tree

serializer/src/main/java/org/apache/xml/serializer/ToStream.java

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,10 @@
4747
import org.xml.sax.SAXException;
4848

4949
/**
50-
* This abstract class is a base class for other stream
51-
* serializers (xml, html, text ...) that write output to a stream.
50+
* This abstract class is a base class for other stream serializers
51+
* (xml, html, text ...) that write output to a stream. Note that
52+
* this is stateful, NOT designed to be multithreaded; each thread and
53+
* each output stream should have its own instance.
5254
*
5355
* @xsl.usage internal
5456
*/
@@ -61,7 +63,6 @@ abstract public class ToStream extends SerializerBase
6163
/** Stack to keep track of disabling output escaping. */
6264
protected BoolStack m_disableOutputEscapingStates = new BoolStack();
6365

64-
6566
/**
6667
* The encoding information associated with this serializer.
6768
* Although initially there is no encoding,
@@ -174,7 +175,40 @@ abstract public class ToStream extends SerializerBase
174175
* which is existing older behavior.
175176
*/
176177
private boolean m_expandDTDEntities = true;
177-
178+
179+
/**
180+
* Traditionally, we handled Surrogate Character Pairs by looking
181+
* ahead in the input buffer. This could fail if, eg, the pair crossed
182+
* between one call to characters() and the next, which can happen
183+
* since SAX providers are free to manage buffering as they see fit
184+
* and what the XML Data Model considers a single block of text
185+
* may be delivered in multiple calls.
186+
*
187+
* The more robust solution is to maintain state, setting the High
188+
* UTF16 Surrogate character aside and processing it when the Low
189+
* Surrogate arrives.
190+
*
191+
* However, handling this robustly requires recognizing, and
192+
* handling, cases where a Surrogate appears but is not adjacent to
193+
* the other half of the pair. That's illegal UTF16, but as utility
194+
* code we can't guarantee some caller won't attempt it.
195+
*
196+
* Historically, we have handled this one of two ways, either
197+
* generating an IOException with ER_INVALID_UTF16_SURROGATE or
198+
* outputting the bad surrogate as a Numeric Character Reference
199+
* (and possibly issuing a message to stderr, as in ToTextStream).
200+
* The inconsistency annoys me a bit. Only SGML-based formats
201+
* support NCRs, and XML explicitly says that even an NCR may not
202+
* represent an isolated surrogate. Hence, for correctness, we AT
203+
* LEAST want the stderr message, and arguably should be throwing
204+
* the exception. However, if we change any of this behavior we
205+
* want to be able to revert to the prior response, in case some
206+
* user is actually expecting to see that.
207+
*
208+
* Note that since we process char arrays, the "pending high surrogate"
209+
* buffer is a char, with 0 used to indicate "empty buffer".
210+
*/
211+
private char m_pendingUTF16HighSurrogate = 0;
178212

179213
/**
180214
* Default constructor
@@ -959,67 +993,69 @@ protected boolean escapingNotNeeded(char ch)
959993
/**
960994
* Once a surrogate has been detected, write out the pair of
961995
* characters if it is in the encoding, or if there is no
962-
* encoding, otherwise write out an entity reference
996+
* encoding, otherwise write out a numeric character reference
963997
* of the value of the unicode code point of the character
964998
* represented by the high/low surrogate pair.
965999
* <p>
9661000
* An exception is thrown if there is no low surrogate in the pair,
9671001
* because the array ends unexpectedly, or if the low char is there
9681002
* but its value is such that it is not a low surrogate.
9691003
*
970-
* @param c the first (high) part of the surrogate, which
1004+
* @param high the first (high) part of the surrogate, which
9711005
* must be confirmed before calling this method.
9721006
* @param ch Character array.
9731007
* @param i position Where the surrogate was detected.
9741008
* @param end The end index of the significant characters.
9751009
* @return 0 if the pair of characters was written out as-is,
9761010
* the unicode code point of the character represented by
977-
* the surrogate pair if an entity reference with that value
1011+
* the surrogate pair if a numeric char ref with that value
9781012
* was written out.
9791013
*
9801014
* @throws IOException if invalid UTF-16 surrogate detected.
9811015
*/
982-
protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
1016+
protected int writeUTF16Surrogate(final char high, char ch[], int i, int end)
9831017
throws IOException
9841018
{
985-
int codePoint = 0;
1019+
// THROWS if surrogate pair crosses input buffers
1020+
// Should probably handle this better.
9861021
if (i + 1 >= end)
9871022
{
9881023
throw new IOException(
9891024
Utils.messages.createMessage(
9901025
MsgKey.ER_INVALID_UTF16_SURROGATE,
991-
new Object[] { Integer.toHexString((int) c)}));
1026+
new Object[] { Integer.toHexString((int) high)}));
9921027
}
9931028

994-
final char high = c;
9951029
final char low = ch[i+1];
9961030
if (!Encodings.isLowUTF16Surrogate(low)) {
9971031
throw new IOException(
9981032
Utils.messages.createMessage(
9991033
MsgKey.ER_INVALID_UTF16_SURROGATE,
10001034
new Object[] {
1001-
Integer.toHexString((int) c)
1035+
Integer.toHexString((int) high)
10021036
+ " "
10031037
+ Integer.toHexString(low)}));
10041038
}
10051039

10061040
final java.io.Writer writer = m_writer;
1041+
int codePoint = 0; // Nonzero iff written as NCR
10071042

10081043
// If we make it to here we have a valid high, low surrogate pair
1009-
if (m_encodingInfo.isInEncoding(c,low)) {
1044+
if (m_encodingInfo.isInEncoding(high,low)) {
10101045
// If the character formed by the surrogate pair
10111046
// is in the encoding, so just write it out
1047+
// NOTE: Assumes same buffer
10121048
writer.write(ch,i,2);
10131049
}
10141050
else {
10151051
// Don't know what to do with this char, it is
10161052
// not in the encoding and not a high char in
1017-
// a surrogate pair, so write out as an entity ref
1053+
// a surrogate pair, so write out as a numeric char ref
10181054
final String encoding = getEncoding();
10191055
if (encoding != null) {
10201056
/* The output encoding is known,
10211057
* so something is wrong.
1022-
*/
1058+
*/
10231059
codePoint = Encodings.toCodePoint(high, low);
10241060
// not in the encoding, so write out a character reference
10251061
writer.write('&');
@@ -1033,7 +1069,10 @@ protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
10331069
writer.write(ch, i, 2);
10341070
}
10351071
}
1036-
// non-zero only if character reference was written out.
1072+
1073+
// ToTextStream tests this and issues an error message (but
1074+
// not exception) if the not-in-encoding case arises,
1075+
// outputting an NCR in passing.
10371076
return codePoint;
10381077
}
10391078

serializer/src/main/java/org/apache/xml/serializer/ToTextStream.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@ void writeNormalizedChars(
291291
if (codePoint != 0) {
292292
// I think we can just emit the message,
293293
// not crash and burn.
294+
// Git commit ffb244aaa0f88368a0bf483bddc7e74d8a4d83bf?
294295
final String integralValue = Integer.toString(codePoint);
295296
final String msg = Utils.messages.createMessage(
296297
MsgKey.ER_ILLEGAL_CHARACTER,
@@ -306,7 +307,7 @@ void writeNormalizedChars(
306307
} else {
307308
// Don't know what to do with this char, it is
308309
// not in the encoding and not a high char in
309-
// a surrogate pair, so write out as an entity ref
310+
// a surrogate pair, so write out as numeric char ref
310311
if (encoding != null) {
311312
/* The output encoding is known,
312313
* so something is wrong.

0 commit comments

Comments
 (0)