Skip to content

Commit dfb7277

Browse files
Document the characters()other()characters() issue if first char buffer ended in a high surrogate.
1 parent ec7f0e2 commit dfb7277

1 file changed

Lines changed: 101 additions & 101 deletions

File tree

serializer/src/main/java/org/apache/xml/serializer/ToStream.java

Lines changed: 101 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -1497,9 +1497,22 @@ else if (m_needToCallStartDocument)
14971497
// that was processed
14981498
final Writer writer = m_writer;
14991499
boolean isAllWhitespace = true;
1500-
1501-
// process any leading whitespace
15021500
i = start;
1501+
1502+
// Note: The case where m_pendingHighUTF16Surrogate is set upon entry
1503+
// but the first character is not the low surrogate is perplexing.
1504+
// THEORETICALLY, everything but characters() should recognize that
1505+
// case as meaning characters() just ended abnormally and flush or
1506+
// report the isolated high surrogate before they start, rather than
1507+
// leaving it for us to erroneously insert into the next character block.
1508+
// But that's a pretty pervasive change for a rare error case.
1509+
//
1510+
// (Not handling it that way risks the high surrogate being flushed into
1511+
// the start of the next characters() block, and that in turn would need
1512+
// a special case here or it would be flushed after the whitespace...
1513+
// This needs more thought.) GONK TODO REVIEW.
1514+
1515+
// process any leading whitespace
15031516
while (i < end && isAllWhitespace) {
15041517
char ch1 = chars[i];
15051518

@@ -1558,6 +1571,7 @@ else if (m_needToCallStartDocument)
15581571
m_ispreserve = true;
15591572

15601573

1574+
// Process characters after initial whitespace (if any)
15611575
for (; i < end; i++)
15621576
{
15631577
char ch = chars[i];
@@ -1571,117 +1585,103 @@ else if (m_needToCallStartDocument)
15711585
writer.write(outputStringForChar);
15721586
lastDirtyCharProcessed = i;
15731587
}
1574-
else {
1575-
if (ch <= 0x1F) {
1576-
// Range 0x00 through 0x1F inclusive
1577-
//
1578-
// This covers the non-whitespace control characters
1579-
// in the range 0x1 to 0x1F inclusive.
1580-
// It also covers the whitespace control characters in the same way:
1581-
// 0x9 TAB
1582-
// 0xA NEW LINE
1583-
// 0xD CARRIAGE RETURN
1584-
//
1585-
// We also cover 0x0 ... It isn't valid
1586-
// but we will output "&#0;"
1587-
1588-
// The default will handle this just fine, but this
1589-
// is a little performance boost to handle the more
1590-
// common TAB, NEW-LINE, CARRIAGE-RETURN
1591-
switch (ch) {
1592-
1593-
case CharInfo.S_HORIZONAL_TAB:
1594-
// Leave whitespace TAB as a real character
1595-
break;
1596-
case CharInfo.S_LINEFEED:
1597-
lastDirtyCharProcessed = processLineFeed(chars, i, lastDirtyCharProcessed, writer);
1598-
break;
1599-
case CharInfo.S_CARRIAGERETURN:
1600-
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1601-
writer.write("&#13;");
1602-
lastDirtyCharProcessed = i;
1603-
// Leave whitespace carriage return as a real character
1604-
break;
1605-
default:
1606-
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1607-
writer.write("&#");
1608-
writer.write(Integer.toString(ch));
1609-
writer.write(';');
1610-
lastDirtyCharProcessed = i;
1611-
break;
1588+
else if (ch <= 0x1F) {
1589+
// Range 0x00 through 0x1F inclusive
1590+
//
1591+
// This covers the non-whitespace control characters
1592+
// in the range 0x1 to 0x1F inclusive.
1593+
// It also covers the whitespace control characters in the same way:
1594+
// 0x9 TAB
1595+
// 0xA NEW LINE
1596+
// 0xD CARRIAGE RETURN
1597+
//
1598+
// We also cover 0x0 ... It isn't valid
1599+
// but we will output "&#0;"
16121600

1613-
}
1614-
}
1615-
else if (ch < 0x7F) {
1616-
// Range 0x20 through 0x7E inclusive
1617-
// Normal ASCII chars, do nothing, just add it to
1618-
// the clean characters
1601+
// The default will handle this just fine, but this
1602+
// is a little performance boost to handle the more
1603+
// common TAB, NEW-LINE, CARRIAGE-RETURN
1604+
switch (ch) {
16191605

1620-
}
1621-
else if (ch <= 0x9F){
1622-
// Range 0x7F through 0x9F inclusive
1623-
// More control characters, including NEL (0x85)
1606+
case CharInfo.S_HORIZONAL_TAB:
1607+
// Leave whitespace TAB as a real character
1608+
break;
1609+
case CharInfo.S_LINEFEED:
1610+
lastDirtyCharProcessed = processLineFeed(chars, i, lastDirtyCharProcessed, writer);
1611+
break;
1612+
case CharInfo.S_CARRIAGERETURN:
1613+
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1614+
writer.write("&#13;");
1615+
lastDirtyCharProcessed = i;
1616+
// Leave whitespace carriage return as a real character
1617+
break;
1618+
default:
16241619
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
16251620
writer.write("&#");
16261621
writer.write(Integer.toString(ch));
16271622
writer.write(';');
16281623
lastDirtyCharProcessed = i;
1624+
break;
1625+
16291626
}
1630-
else if (ch == CharInfo.S_LINE_SEPARATOR) {
1631-
// LINE SEPARATOR
1632-
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1633-
writer.write("&#8232;");
1634-
lastDirtyCharProcessed = i;
1635-
}
1636-
else if (m_encodingInfo.isInEncoding(ch)) {
1637-
// If the character is in the encoding, and
1638-
// not in the normal ASCII range, we also
1639-
// just leave it get added on to the clean characters
1640-
}
1641-
// else if (Encodings.isHighUTF16Surrogate(ch) && i < end-1 && Encodings.isLowUTF16Surrogate(chars[i+1])) {
1642-
// // So, this is a (valid) surrogate pair
1643-
// if (! m_encodingInfo.isInEncoding(ch, chars[i+1])) {
1644-
// int codepoint = Encodings.toCodePoint(ch, chars[i+1]);
1645-
// writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1646-
// writer.write("&#");
1647-
// writer.write(Integer.toString(codepoint));
1648-
// writer.write(';');
1649-
// lastDirtyCharProcessed = i+1;
1650-
// } // Else pair is in encoding, not "dirty", just copy
1651-
// i++; // skip the low surrogate, too
1652-
// }
1653-
else if (Encodings.isHighUTF16Surrogate(ch)) {
1654-
writeOutCleanChars(chars,i,lastDirtyCharProcessed);
1655-
m_pendingHighUTF16Surrogate=ch;
1656-
lastDirtyCharProcessed=i;
1657-
}
1658-
else if (Encodings.isLowUTF16Surrogate(ch)) {
1659-
if(m_encodingInfo.isInEncoding(m_pendingHighUTF16Surrogate,ch)) {
1660-
char[] buffer= {m_pendingHighUTF16Surrogate,ch};
1661-
writer.write(buffer);
1662-
} else {
1663-
// Clean characters should have been flushed by high surrogate
1664-
int codepoint = Encodings.toCodePoint(m_pendingHighUTF16Surrogate,ch);
1665-
writer.write("&#");
1666-
writer.write(Integer.toString(codepoint));
1667-
writer.write(';');
1668-
}
1669-
m_pendingHighUTF16Surrogate=0;
1670-
lastDirtyCharProcessed=i;
1671-
}
1672-
else {
1673-
// This is a fallback plan, we get here if the
1674-
// encoding doesn't contain ch and it's not part
1675-
// of a surrogate pair
1676-
// The right thing is to write out an entity
1677-
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1627+
}
1628+
else if (ch < 0x7F) {
1629+
// Range 0x20 through 0x7E inclusive
1630+
// Normal ASCII chars, do nothing, just add it to
1631+
// the clean characters
1632+
1633+
}
1634+
else if (ch <= 0x9F){
1635+
// Range 0x7F through 0x9F inclusive
1636+
// More control characters, including NEL (0x85)
1637+
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1638+
writer.write("&#");
1639+
writer.write(Integer.toString(ch));
1640+
writer.write(';');
1641+
lastDirtyCharProcessed = i;
1642+
}
1643+
else if (ch == CharInfo.S_LINE_SEPARATOR) {
1644+
// LINE SEPARATOR
1645+
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1646+
writer.write("&#8232;");
1647+
lastDirtyCharProcessed = i;
1648+
}
1649+
else if (m_encodingInfo.isInEncoding(ch)) {
1650+
// If the character is in the encoding, and
1651+
// not in the normal ASCII range, we also
1652+
// just leave it get added on to the clean characters
1653+
}
1654+
else if (Encodings.isLowUTF16Surrogate(ch)) {
1655+
if(m_encodingInfo.isInEncoding(m_pendingHighUTF16Surrogate,ch)) {
1656+
char[] buffer= {m_pendingHighUTF16Surrogate,ch};
1657+
writer.write(buffer);
1658+
} else {
1659+
// Clean characters should have been flushed by high surrogate
1660+
int codepoint = Encodings.toCodePoint(m_pendingHighUTF16Surrogate,ch);
16781661
writer.write("&#");
1679-
writer.write(Integer.toString(ch));
1662+
writer.write(Integer.toString(codepoint));
16801663
writer.write(';');
1681-
lastDirtyCharProcessed = i;
16821664
}
1665+
m_pendingHighUTF16Surrogate=0;
1666+
lastDirtyCharProcessed=i;
16831667
}
1684-
}
1668+
else if (Encodings.isHighUTF16Surrogate(ch)) {
1669+
writeOutCleanChars(chars,i,lastDirtyCharProcessed);
1670+
m_pendingHighUTF16Surrogate=ch;
1671+
lastDirtyCharProcessed=i;
1672+
}
1673+
else {
1674+
// This is a fallback plan, we get here if the
1675+
// encoding doesn't contain ch and it's not part
1676+
// of a surrogate pair
1677+
// The right thing is to write out an entity
1678+
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1679+
writer.write("&#");
1680+
writer.write(Integer.toString(ch));
1681+
writer.write(';');
1682+
lastDirtyCharProcessed = i;
1683+
}
1684+
} // end input scan loop
16851685

16861686
// we've reached the end. Any clean characters at the
16871687
// end of the array that need to be written out?

0 commit comments

Comments
 (0)