From 02a2cbc707e3c8d7abcffb9393b78dc0f61fcca6 Mon Sep 17 00:00:00 2001
From: Guillaume Bort <guillaume.bort@gmail.com>
Date: Tue, 29 Sep 2020 11:26:32 +0200
Subject: [PATCH 1/2] Add a CBORGenerator feature for lenient unicode encoding

If enabled, the generator will output the Unicode Replacement Character
for invalid unicode sequence (invalid surrogate chars in the Java
String) instead of failing with an IllegalArgumentException
---
 .../dataformat/cbor/CBORGenerator.java        | 142 +++++++++---------
 .../jackson/dataformat/cbor/CBORTestBase.java |   8 +
 .../cbor/gen/UnicodeGenerationTest.java       | 114 ++++++++++++++
 3 files changed, 195 insertions(+), 69 deletions(-)
 create mode 100644 cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java

diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
index ebf8cb56b..062399429 100644
--- a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
+++ b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
 {
     private final static int[] NO_INTS = new int[0];
 
+    /**
+     * The replacement character to use to fix invalid unicode sequences.
+     */
+    final static int REPLACEMENT_CHAR = 0xfffd;
+
     /**
      * Let's ensure that we have big enough output buffer because of safety
      * margins we need for UTF-8 encoding.
@@ -63,7 +68,14 @@ public enum Feature implements FormatFeature {
          *
          * @since 2.5
          */
-        WRITE_TYPE_HEADER(false)
+        WRITE_TYPE_HEADER(false),
+
+        /**
+         * Feature that determines if an invalid surrogate encoding found in the
+         * incoming String should fail with an exception or silently be outputed
+         * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
+         */
+        LENIENT_UTF_ENCODING(false),
 
         ;
 
@@ -140,6 +152,8 @@ public int getMask() {
 
     protected boolean _cfgMinimalInts;
 
+    protected boolean _cfgLenientUnicodeEncoding;
+
     /*
     /**********************************************************
     /* Output state
@@ -234,6 +248,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures,
         _cborContext = CBORWriteContext.createRootContext(dups);
         _formatFeatures = formatFeatures;
         _cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures);
+        _cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures);
         _ioContext = ctxt;
         _out = out;
         _bufferRecyclable = true;
@@ -406,6 +421,9 @@ public CBORGenerator enable(Feature f) {
         if (f == Feature.WRITE_MINIMAL_INTS) {
             _cfgMinimalInts = true;
         }
+        if (f == Feature.LENIENT_UTF_ENCODING) {
+            _cfgLenientUnicodeEncoding = true;
+        }
         return this;
     }
 
@@ -414,6 +432,9 @@ public CBORGenerator disable(Feature f) {
         if (f == Feature.WRITE_MINIMAL_INTS) {
             _cfgMinimalInts = false;
         }
+        if (f == Feature.LENIENT_UTF_ENCODING) {
+            _cfgLenientUnicodeEncoding = false;
+        }
         return this;
     }
 
@@ -1424,61 +1445,13 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
         do {
             int c = str[i];
             if (c > 0x7F) {
-                return _shortUTF8Encode2(str, i, end, outputPtr, outputStart);
+                return _encode2(i, outputPtr, str, end, outputStart);
             }
             outBuf[outputPtr++] = (byte) c;
         } while (++i < end);
         return outputPtr - outputStart;
     }
 
-    /**
-     * Helper method called when the whole character sequence is known to fit in
-     * the output buffer, but not all characters are single-byte (ASCII)
-     * characters.
-     */
-    private final int _shortUTF8Encode2(char[] str, int i, int end,
-            int outputPtr, int outputStart) {
-        final byte[] outBuf = _outputBuffer;
-        while (i < end) {
-            int c = str[i++];
-            if (c <= 0x7F) {
-                outBuf[outputPtr++] = (byte) c;
-                continue;
-            }
-            // Nope, multi-byte:
-            if (c < 0x800) { // 2-byte
-                outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6));
-                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-                continue;
-            }
-            // 3 or 4 bytes (surrogate)
-            // Surrogates?
-            if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
-                outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
-                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-                continue;
-            }
-            // Yup, a surrogate pair
-            if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
-            }
-            // ... meaning it must have a pair
-            if (i >= end) {
-                _throwIllegalSurrogate(c);
-            }
-            c = _convertSurrogate(c, str[i++]);
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
-            }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-        }
-        return (outputPtr - outputStart);
-    }
-
     private final int _encode(int outputPtr, String str, int len) {
         final byte[] outBuf = _outputBuffer;
         final int outputStart = outputPtr;
@@ -1486,19 +1459,19 @@ private final int _encode(int outputPtr, String str, int len) {
         for (int i = 0; i < len; ++i) {
             int c = str.charAt(i);
             if (c > 0x7F) {
-                return _encode2(i, outputPtr, str, len, outputStart);
+                return _encode2(i, outputPtr, str.toCharArray(), len, outputStart);
             }
             outBuf[outputPtr++] = (byte) c;
         }
         return (outputPtr - outputStart);
     }
 
-    private final int _encode2(int i, int outputPtr, String str, int len,
+    private final int _encode2(int i, int outputPtr, char[] str, int len,
             int outputStart) {
         final byte[] outBuf = _outputBuffer;
         // no; non-ASCII stuff, slower loop
         while (i < len) {
-            int c = str.charAt(i++);
+            int c = str[i++];
             if (c <= 0x7F) {
                 outBuf[outputPtr++] = (byte) c;
                 continue;
@@ -1520,20 +1493,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
             }
             // Yup, a surrogate pair
             if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
+                if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogate(c);
+                }
             }
             // ... meaning it must have a pair
-            if (i >= len) {
-                _throwIllegalSurrogate(c);
+            else if (i >= len) {
+               if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogate(c);
+                }
             }
-            c = _convertSurrogate(c, str.charAt(i++));
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
+            // ... verify that the next character is in range
+            else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
+                if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogatePair(c, str[i]);
+                }
+            }
+            // ... we have a valid surrogate pair
+            else {
+                c = _convertSurrogate(c, str[i++]);
+            }
+            // if we replaced by the replacement char we actually have a 3 bytes char
+            if (c == REPLACEMENT_CHAR) {
+                outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+            } else {
+                outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
             }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
         }
         return (outputPtr - outputStart);
     }
@@ -1542,16 +1538,24 @@ private final int _encode2(int i, int outputPtr, String str, int len,
      * Method called to calculate UTF codepoint, from a surrogate pair.
      */
     private int _convertSurrogate(int firstPart, int secondPart) {
-        // Ok, then, is the second part valid?
-        if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
-            throw new IllegalArgumentException(
+        int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
+                + (secondPart - SURR2_FIRST);
+        if (c > 0x10FFFF) { // illegal in JSON as well as in XML
+            if (_cfgLenientUnicodeEncoding) {
+                c = REPLACEMENT_CHAR;
+            } else {
+                _throwIllegalSurrogate(c);
+            }
+        }
+        return c;
+    }
+
+    private void _throwIllegalSurrogatePair(int firstPart, int secondPart) {
+         throw new IllegalArgumentException(
                     "Broken surrogate pair: first char 0x"
                             + Integer.toHexString(firstPart) + ", second 0x"
                             + Integer.toHexString(secondPart)
                             + "; illegal combination");
-        }
-        return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
-                + (secondPart - SURR2_FIRST);
     }
 
     private void _throwIllegalSurrogate(int code) {
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java
index 43c609637..a4a4e553d 100644
--- a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java
+++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java
@@ -85,6 +85,14 @@ protected CBORGenerator cborGenerator(CBORFactory f,
         return f.createGenerator(result, null);
     }
 
+    protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result)
+        throws IOException
+    {
+        CBORGenerator gen = cborGenerator(result);
+        gen.enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING);
+        return gen;
+    }
+
     /*
     /**********************************************************
     /* Additional assertion methods
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java
new file mode 100644
index 000000000..5fb96d0a2
--- /dev/null
+++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java
@@ -0,0 +1,114 @@
+
+import java.io.*;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.*;
+
+import org.junit.Assert;
+
+import com.fasterxml.jackson.core.JsonGenerationException;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import com.fasterxml.jackson.dataformat.cbor.CBORConstants;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+import com.fasterxml.jackson.dataformat.cbor.CBORParser;
+import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
+
+public class UnicodeGenerationTest extends CBORTestBase
+{   
+    /**
+     * Test that encoding a String containing invalid surrogates fail with an exception
+     */
+    public void testFailForInvalidSurrogate() throws Exception
+    {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        CBORGenerator gen = cborGenerator(out);
+
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched first surrogate character
+        try { 
+            gen.writeString("x\ud83d");
+        } catch (IllegalArgumentException e) {
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character
+        try { 
+            gen.writeString("x\ude01");
+        } catch (IllegalArgumentException e) {
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character (2)
+        try { 
+            gen.writeString("x\ude01x");
+        } catch (IllegalArgumentException e) {
+        }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Broken surrogate pair
+        try { 
+            gen.writeString("x\ud83dx");
+        } catch (IllegalArgumentException e) {
+        }
+        assertEquals(0, gen.getOutputBuffered());
+    }
+
+    /**
+     * Test that when the lenient unicode feature is enabled, the replacement character is used to fix invalid sequences
+     */
+    public void testRecoverInvalidSurrogate() throws Exception
+    {
+        ByteArrayOutputStream out;
+        CBORGenerator gen;
+        byte[] b;
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+    
+        // Unmatched first surrogate character
+        gen.writeString("x\ud83d");
+        gen.close();
+        b = "x\ufffd".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+    
+        // Unmatched second surrogate character
+        gen.writeString("x\ude01");
+        gen.close();
+        b = "x\ufffd".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+    
+        // Unmatched second surrogate character (2)
+        gen.writeString("x\ude01x");
+        gen.close();
+        b = "x\ufffdx".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+    
+        // Broken surrogate pair
+        gen.writeString("x\ud83dx");
+        gen.close();
+        b = "x\ufffdx".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+    }
+
+}
\ No newline at end of file

From 5760c702c1d691743a2c592c67bb71a8b978bca8 Mon Sep 17 00:00:00 2001
From: Guillaume Bort <guillaume.bort@gmail.com>
Date: Wed, 30 Sep 2020 10:23:53 +0200
Subject: [PATCH 2/2] Address review comments

---
 .../dataformat/cbor/CBORGenerator.java        | 83 +++++++++----------
 1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
index 062399429..7b75bd10a 100644
--- a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
+++ b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
@@ -74,6 +74,8 @@ public enum Feature implements FormatFeature {
          * Feature that determines if an invalid surrogate encoding found in the
          * incoming String should fail with an exception or silently be outputed
          * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
+         *
+         * @since 2.12
          */
         LENIENT_UTF_ENCODING(false),
 
@@ -152,6 +154,11 @@ public int getMask() {
 
     protected boolean _cfgMinimalInts;
 
+
+    /**
+     * If true we will output the REPLACEMENT_CHAR for invalid unicode sequences.
+     * If false we will throw an IllegalArgumentException for invalid unicode sequences.
+     */
     protected boolean _cfgLenientUnicodeEncoding;
 
     /*
@@ -1493,27 +1500,15 @@ private final int _encode2(int i, int outputPtr, char[] str, int len,
             }
             // Yup, a surrogate pair
             if (c > SURR1_LAST) { // must be from first range; second won't do
-                if (_cfgLenientUnicodeEncoding) {
-                    c = REPLACEMENT_CHAR;
-                } else {
-                    _throwIllegalSurrogate(c);
-                }
+                c = _illegalSurrogateFound(c);
             }
             // ... meaning it must have a pair
             else if (i >= len) {
-               if (_cfgLenientUnicodeEncoding) {
-                    c = REPLACEMENT_CHAR;
-                } else {
-                    _throwIllegalSurrogate(c);
-                }
+                c = _illegalSurrogateFound(c);
             }
             // ... verify that the next character is in range
             else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
-                if (_cfgLenientUnicodeEncoding) {
-                    c = REPLACEMENT_CHAR;
-                } else {
-                    _throwIllegalSurrogatePair(c, str[i]);
-                }
+                c = _illegalSurrogatePairFound(c, str[i]);
             }
             // ... we have a valid surrogate pair
             else {
@@ -1541,43 +1536,47 @@ private int _convertSurrogate(int firstPart, int secondPart) {
         int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
                 + (secondPart - SURR2_FIRST);
         if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-            if (_cfgLenientUnicodeEncoding) {
-                c = REPLACEMENT_CHAR;
-            } else {
-                _throwIllegalSurrogate(c);
-            }
+            c = _illegalSurrogatePairFound(firstPart, secondPart);
         }
         return c;
     }
 
-    private void _throwIllegalSurrogatePair(int firstPart, int secondPart) {
-         throw new IllegalArgumentException(
-                    "Broken surrogate pair: first char 0x"
-                            + Integer.toHexString(firstPart) + ", second 0x"
-                            + Integer.toHexString(secondPart)
-                            + "; illegal combination");
+    private int _illegalSurrogatePairFound(int firstPart, int secondPart) {
+        if (_cfgLenientUnicodeEncoding) {
+            return REPLACEMENT_CHAR;
+        } else {
+            throw new IllegalArgumentException(
+                "Broken surrogate pair: first char 0x"
+                        + Integer.toHexString(firstPart) + ", second 0x"
+                        + Integer.toHexString(secondPart)
+                        + "; illegal combination");
+        }
     }
 
-    private void _throwIllegalSurrogate(int code) {
-        if (code > 0x10FFFF) { // over max?
-            throw new IllegalArgumentException("Illegal character point (0x"
-                    + Integer.toHexString(code)
-                    + ") to output; max is 0x10FFFF as per RFC 4627");
-        }
-        if (code >= SURR1_FIRST) {
-            if (code <= SURR1_LAST) { // Unmatched first part (closing without
-                                      // second part?)
+    private int _illegalSurrogateFound(int code) {
+        if (_cfgLenientUnicodeEncoding) {
+            return REPLACEMENT_CHAR;
+        } else {
+            if (code > 0x10FFFF) { // over max?
+                throw new IllegalArgumentException("Illegal character point (0x"
+                        + Integer.toHexString(code)
+                        + ") to output; max is 0x10FFFF as per RFC 4627");
+            }
+            if (code >= SURR1_FIRST) {
+                if (code <= SURR1_LAST) { // Unmatched first part (closing without
+                                        // second part?)
+                    throw new IllegalArgumentException(
+                            "Unmatched first part of surrogate pair (0x"
+                                    + Integer.toHexString(code) + ")");
+                }
                 throw new IllegalArgumentException(
-                        "Unmatched first part of surrogate pair (0x"
+                        "Unmatched second part of surrogate pair (0x"
                                 + Integer.toHexString(code) + ")");
             }
-            throw new IllegalArgumentException(
-                    "Unmatched second part of surrogate pair (0x"
-                            + Integer.toHexString(code) + ")");
+            // should we ever get this?
+            throw new IllegalArgumentException("Illegal character point (0x"
+                    + Integer.toHexString(code) + ") to output");
         }
-        // should we ever get this?
-        throw new IllegalArgumentException("Illegal character point (0x"
-                + Integer.toHexString(code) + ") to output");
     }
 
     /*