@@ -28,6 +28,14 @@ public class CBORGenerator extends GeneratorBase
2828 */
2929 final static int BYTE_BUFFER_FOR_OUTPUT = 16000 ;
3030
31+ /**
32+ * The replacement character to use to fix invalid Unicode sequences
33+ * (mismatched surrogate pair).
34+ *
35+ * @since 2.12
36+ */
37+ final static int REPLACEMENT_CHAR = 0xfffd ;
38+
3139 /**
3240 * Longest char chunk we will output is chosen so that it is guaranteed to
3341 * fit in an empty buffer even if everything encoded in 3-byte sequences;
@@ -58,13 +66,25 @@ public enum Feature implements FormatFeature {
5866 * 55799, encoded as 3-byte sequence of <code>0xD9, 0xD9, 0xF7</code>)
5967 * should be written at the beginning of document or not.
6068 * <p>
61- * Default value is < code> false</code> meaning that type tag will not be
69+ * Default value is {@code false} meaning that type tag will not be
6270 * written at the beginning of a new document.
6371 *
6472 * @since 2.5
6573 */
66- WRITE_TYPE_HEADER (false )
74+ WRITE_TYPE_HEADER (false ),
6775
76+ /**
77+ * Feature that determines how an invalid surrogate encoding found in the
78+ * incoming String is handled: if enabled, it is silently output
79+ * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD); if disabled,
80+ * an exception will be thrown to indicate invalid content.
81+ *<p>
82+ * Default value is {@code false} (for backwards compatibility) meaning that
83+ * an invalid surrogate will result in an exception ({@link IllegalArgumentException}).
84+ *
85+ * @since 2.12
86+ */
87+ LENIENT_UTF_ENCODING (false ),
6888 ;
6989
7090 protected final boolean _defaultState ;
@@ -201,7 +221,7 @@ public int getMask() {
201221
202222 /**
203223 * Number of elements remaining in the current complex structure (if any),
204- * when writing defined-length Arrays, Objects; marker {@link # INDEFINITE_LENGTH}
224+ * when writing defined-length Arrays, Objects; marker {@code INDEFINITE_LENGTH}
205225 * otherwise.
206226 */
207227 protected int _currentRemainingElements = INDEFINITE_LENGTH ;
@@ -1452,29 +1472,25 @@ private final int _shortUTF8Encode2(char[] str, int i, int end,
14521472 continue ;
14531473 }
14541474 // 3 or 4 bytes (surrogate)
1455- // Surrogates?
1456- if (c < SURR1_FIRST || c > SURR2_LAST ) { // nope, regular 3-byte character
1475+ if (c < SURR1_FIRST || c > SURR2_LAST ) { // regular 3-byte character
14571476 outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
14581477 outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
14591478 outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
14601479 continue ;
14611480 }
1462- // Yup, a surrogate pair
1463- if (c > SURR1_LAST ) { // must be from first range; second won't do
1464- _throwIllegalSurrogate (c );
1465- }
1466- // ... meaning it must have a pair
1467- if (i >= end ) {
1468- _throwIllegalSurrogate (c );
1469- }
1470- c = _convertSurrogate (c , str [i ++]);
1471- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1472- _throwIllegalSurrogate (c );
1481+ // Yup, looks like a surrogate pair... but is it?
1482+ if ((c <= SURR1_LAST ) && (i < end )) { // must be from first range and have another char
1483+ final int d = str [i ];
1484+ if ((d <= SURR2_LAST ) && (d >= SURR2_FIRST )) {
1485+ ++i ;
1486+ outputPtr = _decodeAndWriteSurrogate (c , d , outBuf , outputPtr );
1487+ continue ;
1488+ }
1489+ outputPtr = _invalidSurrogateEnd (c , d , outBuf , outputPtr );
1490+ continue ;
14731491 }
1474- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1475- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1476- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1477- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1492+ // Nah, something wrong
1493+ outputPtr = _invalidSurrogateStart (c , outBuf , outputPtr );
14781494 }
14791495 return (outputPtr - outputStart );
14801496 }
@@ -1510,70 +1526,76 @@ private final int _encode2(int i, int outputPtr, String str, int len,
15101526 continue ;
15111527 }
15121528 // 3 or 4 bytes (surrogate)
1513- // Surrogates?
1514- if (c < SURR1_FIRST || c > SURR2_LAST ) { // nope, regular 3-byte
1515- // character
1529+ if (c < SURR1_FIRST || c > SURR2_LAST ) { // regular 3-byte character
15161530 outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
15171531 outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
15181532 outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
15191533 continue ;
15201534 }
1521- // Yup, a surrogate pair
1522- if (c > SURR1_LAST ) { // must be from first range; second won't do
1523- _throwIllegalSurrogate (c );
1524- }
1525- // ... meaning it must have a pair
1526- if (i >= len ) {
1527- _throwIllegalSurrogate (c );
1528- }
1529- c = _convertSurrogate (c , str .charAt (i ++));
1530- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1531- _throwIllegalSurrogate (c );
1535+ // Yup, looks like a surrogate pair... but is it?
1536+ if ((c <= SURR1_LAST ) && (i < len )) { // must be from first range and have another char
1537+ final int d = str .charAt (i );
1538+ if ((d <= SURR2_LAST ) && (d >= SURR2_FIRST )) {
1539+ ++i ;
1540+ outputPtr = _decodeAndWriteSurrogate (c , d , outBuf , outputPtr );
1541+ continue ;
1542+ }
1543+ outputPtr = _invalidSurrogateEnd (c , d , outBuf , outputPtr );
1544+ continue ;
15321545 }
1533- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1534- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1535- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1536- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1546+ // Nah, something wrong
1547+ outputPtr = _invalidSurrogateStart (c , outBuf , outputPtr );
15371548 }
15381549 return (outputPtr - outputStart );
15391550 }
15401551
1541- /**
1542- * Method called to calculate UTF codepoint, from a surrogate pair.
1543- */
1544- private int _convertSurrogate (int firstPart , int secondPart ) {
1545- // Ok, then, is the second part valid?
1546- if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST ) {
1547- throw new IllegalArgumentException (
1548- "Broken surrogate pair: first char 0x"
1549- + Integer .toHexString (firstPart ) + ", second 0x"
1550- + Integer .toHexString (secondPart )
1551- + "; illegal combination" );
1552- }
1553- return 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1554- + (secondPart - SURR2_FIRST );
1555- }
1556-
1557- private void _throwIllegalSurrogate (int code ) {
1558- if (code > 0x10FFFF ) { // over max?
1559- throw new IllegalArgumentException ("Illegal character point (0x"
1560- + Integer .toHexString (code )
1561- + ") to output; max is 0x10FFFF as per RFC 4627" );
1562- }
1563- if (code >= SURR1_FIRST ) {
1564- if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1565- // second part?)
1566- throw new IllegalArgumentException (
1567- "Unmatched first part of surrogate pair (0x"
1568- + Integer .toHexString (code ) + ")" );
1569- }
1570- throw new IllegalArgumentException (
1571- "Unmatched second part of surrogate pair (0x"
1572- + Integer .toHexString (code ) + ")" );
1552+ private int _invalidSurrogateStart (int code , byte [] outBuf , int outputPtr ) {
1553+ if (isEnabled (Feature .LENIENT_UTF_ENCODING )) {
1554+ return _appendReplacementChar (outBuf , outputPtr );
1555+ }
1556+ // Will be called in two distinct cases: either first character is
1557+ // invalid (code range of second part), or first character is valid
1558+ // but there is no second part to encode
1559+ if (code <= SURR1_LAST ) {
1560+ // Unmatched first part (closing without second part?)
1561+ throw new IllegalArgumentException (String .format (
1562+ "Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate" ,
1563+ code ));
1564+ }
1565+ throw new IllegalArgumentException (String .format (
1566+ "Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]" ,
1567+ code ));
1568+ }
1569+
1570+ private int _invalidSurrogateEnd (int surr1 , int surr2 ,
1571+ byte [] outBuf , int outputPtr )
1572+ {
1573+ if (isEnabled (Feature .LENIENT_UTF_ENCODING )) {
1574+ return _appendReplacementChar (outBuf , outputPtr );
15731575 }
1574- // should we ever get this?
1575- throw new IllegalArgumentException ("Illegal character point (0x"
1576- + Integer .toHexString (code ) + ") to output" );
1576+ throw new IllegalArgumentException (String .format (
1577+ "Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
1578+ +" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]" ,
1579+ surr1 , surr2 ));
1580+ }
1581+
1582+ private int _appendReplacementChar (byte [] outBuf , int outputPtr ) {
1583+ outBuf [outputPtr ++] = (byte ) (0xe0 | (REPLACEMENT_CHAR >> 12 ));
1584+ outBuf [outputPtr ++] = (byte ) (0x80 | ((REPLACEMENT_CHAR >> 6 ) & 0x3f ));
1585+ outBuf [outputPtr ++] = (byte ) (0x80 | (REPLACEMENT_CHAR & 0x3f ));
1586+ return outputPtr ;
1587+ }
1588+
1589+ private int _decodeAndWriteSurrogate (int surr1 , int surr2 ,
1590+ byte [] outBuf , int outputPtr )
1591+ {
1592+ final int c = 0x10000 + ((surr1 - SURR1_FIRST ) << 10 )
1593+ + (surr2 - SURR2_FIRST );
1594+ outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1595+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1596+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1597+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1598+ return outputPtr ;
15771599 }
15781600
15791601 /*
0 commit comments