@@ -1497,9 +1497,22 @@ else if (m_needToCallStartDocument)
14971497 // that was processed
14981498 final Writer writer = m_writer ;
14991499 boolean isAllWhitespace = true ;
1500-
1501- // process any leading whitespace
15021500 i = start ;
1501+
1502+ // Note: The case where m_pendingHighUTF16Surrogate is set upon entry
1503+ // but the first character is not the low surrogate is perplexing.
1504+ // THEORETICALLY, everything but characters() should recognize that
1505+ // case as meaning characters() just ended abnormally and flush or
1506+ // report the isolated high surrogate before they start, rather than
1507+ // leaving it for us to erroneously insert into the next character block.
1508+ // But that's a pretty pervasive change for a rare error case.
1509+ //
1510+ // (Not handling it that way risks the high surrogate being flushed into
1511+ // the start of the next characters() block, and that in turn would need
1512+ // a special case here or it would be flushed after the whitespace...)
1513+ // This needs more thought. GONK TODO REVIEW.
1514+
1515+ // process any leading whitespace
15031516 while (i < end && isAllWhitespace ) {
15041517 char ch1 = chars [i ];
15051518
@@ -1558,6 +1571,7 @@ else if (m_needToCallStartDocument)
15581571 m_ispreserve = true ;
15591572
15601573
1574+ // Process characters after initial whitespace (if any)
15611575 for (; i < end ; i ++)
15621576 {
15631577 char ch = chars [i ];
@@ -1571,117 +1585,103 @@ else if (m_needToCallStartDocument)
15711585 writer .write (outputStringForChar );
15721586 lastDirtyCharProcessed = i ;
15731587 }
1574- else {
1575- if (ch <= 0x1F ) {
1576- // Range 0x00 through 0x1F inclusive
1577- //
1578- // This covers the non-whitespace control characters
1579- // in the range 0x1 to 0x1F inclusive.
1580- // It also covers the whitespace control characters in the same way:
1581- // 0x9 TAB
1582- // 0xA NEW LINE
1583- // 0xD CARRIAGE RETURN
1584- //
1585- // We also cover 0x0 ... It isn't valid
1586- // but we will output "&#0;"
1587-
1588- // The default will handle this just fine, but this
1589- // is a little performance boost to handle the more
1590- // common TAB, NEW-LINE, CARRIAGE-RETURN
1591- switch (ch ) {
1592-
1593- case CharInfo .S_HORIZONAL_TAB :
1594- // Leave whitespace TAB as a real character
1595- break ;
1596- case CharInfo .S_LINEFEED :
1597- lastDirtyCharProcessed = processLineFeed (chars , i , lastDirtyCharProcessed , writer );
1598- break ;
1599- case CharInfo .S_CARRIAGERETURN :
1600- writeOutCleanChars (chars , i , lastDirtyCharProcessed );
1601- writer .write (" " );
1602- lastDirtyCharProcessed = i ;
1603- // Leave whitespace carriage return as a real character
1604- break ;
1605- default :
1606- writeOutCleanChars (chars , i , lastDirtyCharProcessed );
1607- writer .write ("&#" );
1608- writer .write (Integer .toString (ch ));
1609- writer .write (';' );
1610- lastDirtyCharProcessed = i ;
1611- break ;
1588+ else if (ch <= 0x1F ) {
1589+ // Range 0x00 through 0x1F inclusive
1590+ //
1591+ // This covers the non-whitespace control characters
1592+ // in the range 0x1 to 0x1F inclusive.
1593+ // It also covers the whitespace control characters in the same way:
1594+ // 0x9 TAB
1595+ // 0xA NEW LINE
1596+ // 0xD CARRIAGE RETURN
1597+ //
1598+ // We also cover 0x0 ... It isn't valid
1599+ // but we will output "&#0;"
16121600
1613- }
1614- }
1615- else if (ch < 0x7F ) {
1616- // Range 0x20 through 0x7E inclusive
1617- // Normal ASCII chars, do nothing, just add it to
1618- // the clean characters
1601+ // The default will handle this just fine, but this
1602+ // is a little performance boost to handle the more
1603+ // common TAB, NEW-LINE, CARRIAGE-RETURN
1604+ switch (ch ) {
16191605
1620- }
1621- else if (ch <= 0x9F ){
1622- // Range 0x7F through 0x9F inclusive
1623- // More control characters, including NEL (0x85)
1606+ case CharInfo .S_HORIZONAL_TAB :
1607+ // Leave whitespace TAB as a real character
1608+ break ;
1609+ case CharInfo .S_LINEFEED :
1610+ lastDirtyCharProcessed = processLineFeed (chars , i , lastDirtyCharProcessed , writer );
1611+ break ;
1612+ case CharInfo .S_CARRIAGERETURN :
1613+ writeOutCleanChars (chars , i , lastDirtyCharProcessed );
1614+ writer .write (" " );
1615+ lastDirtyCharProcessed = i ;
1616+ // Leave whitespace carriage return as a real character
1617+ break ;
1618+ default :
16241619 writeOutCleanChars (chars , i , lastDirtyCharProcessed );
16251620 writer .write ("&#" );
16261621 writer .write (Integer .toString (ch ));
16271622 writer .write (';' );
16281623 lastDirtyCharProcessed = i ;
1624+ break ;
1625+
16291626 }
1630- else if (ch == CharInfo .S_LINE_SEPARATOR ) {
1631- // LINE SEPARATOR
1632- writeOutCleanChars (chars , i , lastDirtyCharProcessed );
1633- writer .write ("
" );
1634- lastDirtyCharProcessed = i ;
1635- }
1636- else if (m_encodingInfo .isInEncoding (ch )) {
1637- // If the character is in the encoding, and
1638- // not in the normal ASCII range, we also
1639- // just leave it get added on to the clean characters
1640- }
1641- // else if (Encodings.isHighUTF16Surrogate(ch) && i < end-1 && Encodings.isLowUTF16Surrogate(chars[i+1])) {
1642- // // So, this is a (valid) surrogate pair
1643- // if (! m_encodingInfo.isInEncoding(ch, chars[i+1])) {
1644- // int codepoint = Encodings.toCodePoint(ch, chars[i+1]);
1645- // writeOutCleanChars(chars, i, lastDirtyCharProcessed);
1646- // writer.write("&#");
1647- // writer.write(Integer.toString(codepoint));
1648- // writer.write(';');
1649- // lastDirtyCharProcessed = i+1;
1650- // } // Else pair is in encoding, not "dirty", just copy
1651- // i++; // skip the low surrogate, too
1652- // }
1653- else if (Encodings .isHighUTF16Surrogate (ch )) {
1654- writeOutCleanChars (chars ,i ,lastDirtyCharProcessed );
1655- m_pendingHighUTF16Surrogate =ch ;
1656- lastDirtyCharProcessed =i ;
1657- }
1658- else if (Encodings .isLowUTF16Surrogate (ch )) {
1659- if (m_encodingInfo .isInEncoding (m_pendingHighUTF16Surrogate ,ch )) {
1660- char [] buffer = {m_pendingHighUTF16Surrogate ,ch };
1661- writer .write (buffer );
1662- } else {
1663- // Clean characters should have been flushed by high surrogate
1664- int codepoint = Encodings .toCodePoint (m_pendingHighUTF16Surrogate ,ch );
1665- writer .write ("&#" );
1666- writer .write (Integer .toString (codepoint ));
1667- writer .write (';' );
1668- }
1669- m_pendingHighUTF16Surrogate =0 ;
1670- lastDirtyCharProcessed =i ;
1671- }
1672- else {
1673- // This is a fallback plan, we get here if the
1674- // encoding doesn't contain ch and it's not part
1675- // of a surrogate pair
1676- // The right thing is to write out an entity
1677- writeOutCleanChars (chars , i , lastDirtyCharProcessed );
1627+ }
1628+ else if (ch < 0x7F ) {
1629+ // Range 0x20 through 0x7E inclusive
1630+ // Normal ASCII chars, do nothing, just add it to
1631+ // the clean characters
1632+
1633+ }
1634+ else if (ch <= 0x9F ){
1635+ // Range 0x7F through 0x9F inclusive
1636+ // More control characters, including NEL (0x85)
1637+ writeOutCleanChars (chars , i , lastDirtyCharProcessed );
1638+ writer .write ("&#" );
1639+ writer .write (Integer .toString (ch ));
1640+ writer .write (';' );
1641+ lastDirtyCharProcessed = i ;
1642+ }
1643+ else if (ch == CharInfo .S_LINE_SEPARATOR ) {
1644+ // LINE SEPARATOR
1645+ writeOutCleanChars (chars , i , lastDirtyCharProcessed );
1646+ writer .write ("
" );
1647+ lastDirtyCharProcessed = i ;
1648+ }
1649+ else if (m_encodingInfo .isInEncoding (ch )) {
1650+ // If the character is in the encoding, and
1651+ // not in the normal ASCII range, we also
1652+ // just let it be added to the clean characters
1653+ }
1654+ else if (Encodings .isLowUTF16Surrogate (ch )) {
1655+ if (m_encodingInfo .isInEncoding (m_pendingHighUTF16Surrogate ,ch )) {
1656+ char [] buffer = {m_pendingHighUTF16Surrogate ,ch };
1657+ writer .write (buffer );
1658+ } else {
1659+ // Clean characters should have been flushed by high surrogate
1660+ int codepoint = Encodings .toCodePoint (m_pendingHighUTF16Surrogate ,ch );
16781661 writer .write ("&#" );
1679- writer .write (Integer .toString (ch ));
1662+ writer .write (Integer .toString (codepoint ));
16801663 writer .write (';' );
1681- lastDirtyCharProcessed = i ;
16821664 }
1665+ m_pendingHighUTF16Surrogate =0 ;
1666+ lastDirtyCharProcessed =i ;
16831667 }
1684- }
1668+ else if (Encodings .isHighUTF16Surrogate (ch )) {
1669+ writeOutCleanChars (chars ,i ,lastDirtyCharProcessed );
1670+ m_pendingHighUTF16Surrogate =ch ;
1671+ lastDirtyCharProcessed =i ;
1672+ }
1673+ else {
1674+ // This is a fallback plan, we get here if the
1675+ // encoding doesn't contain ch and it's not part
1676+ // of a surrogate pair
1677+ // The right thing is to write out an entity
1678+ writeOutCleanChars (chars , i , lastDirtyCharProcessed );
1679+ writer .write ("&#" );
1680+ writer .write (Integer .toString (ch ));
1681+ writer .write (';' );
1682+ lastDirtyCharProcessed = i ;
1683+ }
1684+ } // end input scan loop
16851685
16861686 // we've reached the end. Any clean characters at the
16871687 // end of the array that need to be written out?
0 commit comments