@@ -1027,6 +1027,34 @@ protected int writeUTF16Surrogate(final char high, char ch[], int i, int end)
10271027 }
10281028
10291029 final char low = ch [i +1 ];
1030+ return writeUTF16Surrogate (high , low );
1031+ }
1032+
1033+
1034+ /**
1035+ * Once a surrogate has been detected, write out the pair of
1036+ * characters if it is in the encoding, or if there is no
1037+ * encoding, otherwise write out a numeric character reference
1038+ * of the value of the unicode code point of the character
1039+ * represented by the high/low surrogate pair.
1040+ * <p>
1041+ * An exception is thrown if there is no low surrogate in the pair,
1042+ * because the array ends unexpectedly, or if the low char is there
1043+ * but its value is such that it is not a low surrogate.
1044+ *
1045+ * @param high the first (high) part of the surrogate, which
1046+ * must be confirmed before calling this method.
1047+ * @param low the second (low) part of the presumed surrogate
1048+ * @return 0 if the pair of characters was written out as-is,
1049+ * or the unicode code point of the character represented by
1050+ * the surrogate pair if a numeric char ref with that value
1051+ * was written out. (REVIEW: Is this needed?)
1052+ *
1053+ * @throws IOException if invalid UTF-16 surrogate detected.
1054+ */
1055+ protected int writeUTF16Surrogate (final char high , final char low )
1056+ throws IOException
1057+ {
10301058 if (!Encodings .isLowUTF16Surrogate (low )) {
10311059 throw new IOException (
10321060 Utils .messages .createMessage (
@@ -1038,35 +1066,37 @@ protected int writeUTF16Surrogate(final char high, char ch[], int i, int end)
10381066 }
10391067
10401068 final java .io .Writer writer = m_writer ;
1041- int codePoint = 0 ; // Nonzero iff written as NCR
1069+ int codePoint = 0 ; // Nonzero iff written as NCR. REVIEW: Needed?
10421070
10431071 // If we make it to here we have a valid high, low surrogate pair
10441072 if (m_encodingInfo .isInEncoding (high ,low )) {
10451073 // If the character formed by the surrogate pair
10461074 // is in the encoding, so just write it out
10471075 // NOTE: Assumes same buffer
1048- writer .write (ch ,i ,2 );
1076+ writer .write (high );
1077+ writer .write (low );
10491078 }
10501079 else {
10511080 // Don't know what to do with this char, it is
10521081 // not in the encoding and not a high char in
10531082 // a surrogate pair, so write out as a numeric char ref
10541083 final String encoding = getEncoding ();
10551084 if (encoding != null ) {
1056- /* The output encoding is known,
1057- * so somthing is wrong.
1085+ /* The output encoding is known but does not include
1086+ * this character. Fallback: Write as NCR
10581087 */
10591088 codePoint = Encodings .toCodePoint (high , low );
1060- // not in the encoding, so write out a character reference
10611089 writer .write ('&' );
10621090 writer .write ('#' );
10631091 writer .write (Integer .toString (codePoint ));
10641092 writer .write (';' );
10651093 } else {
1066- /* The output encoding is not known,
1067- * so just write it out as-is.
1094+ /* The output encoding is not known, so presume
1095+ * Unicode and just write it out. This handles the
1096+ * case of serializing to a character buffer.
10681097 */
1069- writer .write (ch , i , 2 );
1098+ writer .write (high );
1099+ writer .write (low );
10701100 }
10711101 }
10721102
0 commit comments