@@ -72,6 +72,8 @@ public enum Feature implements FormatFeature {
7272 * Feature that determines if an invalid surrogate encoding found in the
7373 * incoming String should fail with an exception or silently be output
7474 * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
75+ *
76+ * @since 2.12
7577 */
7678 LENIENT_UTF_ENCODING (false ),
7779
@@ -150,6 +152,11 @@ public int getMask() {
150152
151153 protected boolean _cfgMinimalInts ;
152154
155+
156+ /**
157+ * If true, we will output the REPLACEMENT_CHAR for invalid Unicode sequences.
158+ * If false, we will throw an IllegalArgumentException for invalid Unicode sequences.
159+ */
153160 protected boolean _cfgLenientUnicodeEncoding ;
154161
155162 /*
@@ -1425,27 +1432,15 @@ private final int _encode2(int i, int outputPtr, char[] str, int len,
14251432 }
14261433 // Yup, a surrogate pair
14271434 if (c > SURR1_LAST ) { // must be from first range; second won't do
1428- if (_cfgLenientUnicodeEncoding ) {
1429- c = REPLACEMENT_CHAR ;
1430- } else {
1431- _throwIllegalSurrogate (c );
1432- }
1435+ c = _illegalSurrogateFound (c );
14331436 }
14341437 // ... meaning it must have a pair
14351438 else if (i >= len ) {
1436- if (_cfgLenientUnicodeEncoding ) {
1437- c = REPLACEMENT_CHAR ;
1438- } else {
1439- _throwIllegalSurrogate (c );
1440- }
1439+ c = _illegalSurrogateFound (c );
14411440 }
14421441 // ... verify that the next character is in range
14431442 else if (str [i ] < SURR2_FIRST || str [i ] > SURR2_LAST ) {
1444- if (_cfgLenientUnicodeEncoding ) {
1445- c = REPLACEMENT_CHAR ;
1446- } else {
1447- _throwIllegalSurrogatePair (c , str [i ]);
1448- }
1443+ c = _illegalSurrogatePairFound (c , str [i ]);
14491444 }
14501445 // ... we have a valid surrogate pair
14511446 else {
@@ -1473,43 +1468,47 @@ private int _convertSurrogate(int firstPart, int secondPart) {
14731468 int c = 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
14741469 + (secondPart - SURR2_FIRST );
14751470 if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1476- if (_cfgLenientUnicodeEncoding ) {
1477- c = REPLACEMENT_CHAR ;
1478- } else {
1479- _throwIllegalSurrogate (c );
1480- }
1471+ c = _illegalSurrogatePairFound (firstPart , secondPart );
14811472 }
14821473 return c ;
14831474 }
14841475
1485- private void _throwIllegalSurrogatePair (int firstPart , int secondPart ) {
1486- throw new IllegalArgumentException (
1487- "Broken surrogate pair: first char 0x"
1488- + Integer .toHexString (firstPart ) + ", second 0x"
1489- + Integer .toHexString (secondPart )
1490- + "; illegal combination" );
1476+ private int _illegalSurrogatePairFound (int firstPart , int secondPart ) {
1477+ if (_cfgLenientUnicodeEncoding ) {
1478+ return REPLACEMENT_CHAR ;
1479+ } else {
1480+ throw new IllegalArgumentException (
1481+ "Broken surrogate pair: first char 0x"
1482+ + Integer .toHexString (firstPart ) + ", second 0x"
1483+ + Integer .toHexString (secondPart )
1484+ + "; illegal combination" );
1485+ }
14911486 }
14921487
1493- private void _throwIllegalSurrogate (int code ) {
1494- if (code > 0x10FFFF ) { // over max?
1495- throw new IllegalArgumentException ("Illegal character point (0x"
1496- + Integer .toHexString (code )
1497- + ") to output; max is 0x10FFFF as per RFC 4627" );
1498- }
1499- if (code >= SURR1_FIRST ) {
1500- if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1501- // second part?)
1488+ private int _illegalSurrogateFound (int code ) {
1489+ if (_cfgLenientUnicodeEncoding ) {
1490+ return REPLACEMENT_CHAR ;
1491+ } else {
1492+ if (code > 0x10FFFF ) { // over max?
1493+ throw new IllegalArgumentException ("Illegal character point (0x"
1494+ + Integer .toHexString (code )
1495+ + ") to output; max is 0x10FFFF as per RFC 4627" );
1496+ }
1497+ if (code >= SURR1_FIRST ) {
1498+ if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1499+ // second part?)
1500+ throw new IllegalArgumentException (
1501+ "Unmatched first part of surrogate pair (0x"
1502+ + Integer .toHexString (code ) + ")" );
1503+ }
15021504 throw new IllegalArgumentException (
1503- "Unmatched first part of surrogate pair (0x"
1505+ "Unmatched second part of surrogate pair (0x"
15041506 + Integer .toHexString (code ) + ")" );
15051507 }
1506- throw new IllegalArgumentException (
1507- "Unmatched second part of surrogate pair (0x"
1508- + Integer .toHexString (code ) + ")" );
1508+ // should we ever get this?
1509+ throw new IllegalArgumentException ( "Illegal character point (0x"
1510+ + Integer .toHexString (code ) + ") to output " );
15091511 }
1510- // should we ever get this?
1511- throw new IllegalArgumentException ("Illegal character point (0x"
1512- + Integer .toHexString (code ) + ") to output" );
15131512 }
15141513
15151514 /*
0 commit comments