@@ -1299,7 +1299,28 @@ - (NSString *)diff_text2:(NSMutableArray *)diffs;
12991299- (NSString *)diff_toDelta : (NSMutableArray *)diffs ;
13001300{
13011301 NSMutableString *delta = [NSMutableString string ];
1302+ UniChar lastEnd = 0 ;
13021303 for (Diff *aDiff in diffs) {
1304+ if (0 == [aDiff.text length ]) {
1305+ continue ;
1306+ }
1307+
1308+ UniChar thisTop = [aDiff.text characterAtIndex: 0 ];
1309+ UniChar thisEnd = [aDiff.text characterAtIndex: ([aDiff.text length ]-1 )];
1310+
1311+ if (CFStringIsSurrogateHighCharacter (thisEnd)) {
1312+ lastEnd = thisEnd;
1313+ aDiff.text = [aDiff.text substringToIndex: ([aDiff.text length ] - 1 )];
1314+ }
1315+
1316+ if (0 != lastEnd && CFStringIsSurrogateHighCharacter (lastEnd) && CFStringIsSurrogateLowCharacter (thisTop)) {
1317+ aDiff.text = [NSString stringWithFormat: @" %C %@ " , lastEnd, aDiff.text];
1318+ }
1319+
1320+ if (0 == [aDiff.text length ]) {
1321+ continue ;
1322+ }
1323+
13031324 switch (aDiff.operation ) {
13041325 case DIFF_INSERT:
13051326 [delta appendFormat: @" +%@ \t " , [[aDiff.text diff_stringByAddingPercentEscapesForEncodeUriCompatibility ]
@@ -1321,6 +1342,176 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs;
13211342 return delta;
13221343}
13231344
1345+ - (NSUInteger )diff_digit16 : (unichar )c
1346+ {
1347+ switch (c) {
1348+ case ' 0' : return 0 ;
1349+ case ' 1' : return 1 ;
1350+ case ' 2' : return 2 ;
1351+ case ' 3' : return 3 ;
1352+ case ' 4' : return 4 ;
1353+ case ' 5' : return 5 ;
1354+ case ' 6' : return 6 ;
1355+ case ' 7' : return 7 ;
1356+ case ' 8' : return 8 ;
1357+ case ' 9' : return 9 ;
1358+ case ' A' : case ' a' : return 10 ;
1359+ case ' B' : case ' b' : return 11 ;
1360+ case ' C' : case ' c' : return 12 ;
1361+ case ' D' : case ' d' : return 13 ;
1362+ case ' E' : case ' e' : return 14 ;
1363+ case ' F' : case ' f' : return 15 ;
1364+ default :
1365+ [NSException raise: @" Invalid percent-encoded string" format: @" %c is not a hex digit" , c];
1366+ }
1367+ }
1368+
1369+ /* *
1370+ * Decode a percent-encoded UTF-8 string into a string of UTF-16 code units
1371+ * This is more permissive than `stringByRemovingPercentEncoding` because
1372+ * that fails if the input represents invalid Unicode characters. However, different
1373+ * diff-match-patch libraries may encode surrogate halves as if they were valid
1374+ * Unicode code points. Therefore, instead of failing or corrupting the output, which
1375+ * `stringByRemovingPercentEncoding` does when it inserts "(null)" in these places
1376+ * we can decode it anyway and then once the string is reconstructed from the diffs
1377+ * we'll end up with valid Unicode again, after the surrogate halves are re-joined
1378+ */
1379+ - (NSString *)diff_decodeURIWithText : (NSString *)percentEncoded
1380+ {
1381+ unichar decoded[[percentEncoded length ]];
1382+ NSInteger input = 0 ;
1383+ NSInteger output = 0 ;
1384+
1385+ @try {
1386+ while (input < [percentEncoded length ]) {
1387+ unichar c = [percentEncoded characterAtIndex: input];
1388+
1389+ // not special, so just return it
1390+ if (' %' != c) {
1391+ decoded[output++] = c;
1392+ input += 1 ;
1393+ continue ;
1394+ }
1395+
1396+ NSUInteger byte1 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+1 )]] << 4 ) +
1397+ [self diff_digit16: [percentEncoded characterAtIndex: (input+2 )]];
1398+
1399+ // single-byte UTF-8 first byte has bitmask 0xxx xxxx
1400+ if ((byte1 & 0x80 ) == 0 ) {
1401+ decoded[output++] = byte1;
1402+ input += 3 ;
1403+ continue ;
1404+ }
1405+
1406+ // at least one continuation byte
1407+ if (' %' != [percentEncoded characterAtIndex: (input + 3 )]) {
1408+ return nil ;
1409+ }
1410+
1411+ NSUInteger byte2 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+4 )]] << 4 ) +
1412+ [self diff_digit16: [percentEncoded characterAtIndex: (input+5 )]];
1413+
1414+ // continuation bytes have bitmask 10xx xxxx
1415+ if ((byte2 & 0xC0 ) != 0x80 ) {
1416+ return nil ;
1417+ }
1418+
1419+ // continuation bytes thus only contribute six bits each
1420+ // these data bits are found with the bit mask xx11 1111
1421+ byte2 = byte2 & 0x3F ;
1422+
1423+ // in two-byte sequences the first byte has bitmask 110x xxxx
1424+ if ((byte1 & 0xE0 ) == 0xC0 ) {
1425+ // byte1 ___x xxxx << 6
1426+ // byte2 __yy yyyy
1427+ // value x xxxxyy yyyy -> 11 bits
1428+ decoded[output++] = ((byte1 & 0x1F ) << 6 ) | byte2;
1429+ input += 6 ;
1430+ continue ;
1431+ }
1432+
1433+ // at least two continuation bytes
1434+ if (' %' != [percentEncoded characterAtIndex: (input + 6 )]) {
1435+ return nil ;
1436+ }
1437+
1438+ NSUInteger byte3 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+7 )]] << 4 ) +
1439+ [self diff_digit16: [percentEncoded characterAtIndex: (input+8 )]];
1440+
1441+ if ((byte3 & 0xC0 ) != 0x80 ) {
1442+ return nil ;
1443+ }
1444+
1445+ byte3 = byte3 & 0x3F ;
1446+
1447+ // in three-byte sequences the first byte has bitmask 1110 xxxx
1448+ if ((byte1 & 0xF0 ) == 0xE0 ) {
1449+ // byte1 ____ xxxx << 12
1450+ // byte2 __yy yyyy << 6
1451+ // byte3 __zz zzzz
1452+ // value xxxxyy yyyyzz zzzz -> 16 bits
1453+ decoded[output++] = ((byte1 & 0x0F ) << 12 ) | (byte2 << 6 ) | byte3;
1454+ input += 9 ;
1455+ continue ;
1456+ }
1457+
1458+ // three continuation bytes
1459+ if (' %' != [percentEncoded characterAtIndex: (input + 9 )]) {
1460+ return nil ;
1461+ }
1462+
1463+ NSUInteger byte4 = ([self diff_digit16: [percentEncoded characterAtIndex: (input+10 )]] << 4 ) +
1464+ [self diff_digit16: [percentEncoded characterAtIndex: (input+11 )]];
1465+
1466+ if ((byte4 & 0xC0 ) != 0x80 ) {
1467+ return nil ;
1468+ }
1469+
1470+ byte4 = byte4 & 0x3F ;
1471+
1472+ // in four-byte sequences the first byte has bitmask 1111 0xxx
1473+ if ((byte1 & 0xF8 ) == 0xF0 ) {
1474+ // byte1 ____ _xxx << 18
1475+ // byte2 __yy yyyy << 12
1476+ // byte3 __zz zzzz << 6
1477+ // byte4 __tt tttt
1478+ // value xxxyy yyyyzz zzzztt tttt -> 21 bits
1479+ NSUInteger codePoint = ((byte1 & 0x07 ) << 0x12 ) | (byte2 << 0x0C ) | (byte3 << 0x06 ) | byte4;
1480+ if (codePoint >= 0x010000 && codePoint <= 0x10FFFF ) {
1481+ codePoint -= 0x010000 ;
1482+ decoded[output++] = ((codePoint >> 10 ) & 0x3FF ) | 0xD800 ;
1483+ decoded[output++] = 0xDC00 | (codePoint & 0x3FF );
1484+ input += 12 ;
1485+ continue ;
1486+ }
1487+ }
1488+
1489+ return nil ;
1490+ }
1491+ }
1492+ @catch (NSException *e) {
1493+ return nil ;
1494+ }
1495+
1496+ // some objective-c versions of the library produced patches with
1497+ // (null) in the place where surrogates were split across diff
1498+ // boundaries. if we leave those in we'll be stuck with a
1499+ // high-surrogate (null) low-surrogate pattern that will break
1500+ // deeper in the library or consuming application. we'll "fix"
1501+ // these by dropping the (null) and re-joining the surrogate halves
1502+ NSString *result = [NSString stringWithCharacters: decoded length: output];
1503+ NSRegularExpression *replacer = [NSRegularExpression
1504+ regularExpressionWithPattern: @" ([\\ x{D800}-\\ x{DBFF}])\\ (null\\ )([\\ x{DC00}-\\ x{DFFF}])"
1505+ options: 0
1506+ error: nil ];
1507+
1508+ return [replacer
1509+ stringByReplacingMatchesInString: result
1510+ options: 0
1511+ range: NSMakeRange (0 , [result length ])
1512+ withTemplate: @" $1$2" ];
1513+ }
1514+
13241515/* *
13251516 * Given the original text1, and an encoded NSString which describes the
13261517 * operations required to transform text1 into text2, compute the full diff.
@@ -1348,7 +1539,7 @@ - (NSMutableArray *)diff_fromDeltaWithText:(NSString *)text1
13481539 NSString *param = [token substringFromIndex: 1 ];
13491540 switch ([token characterAtIndex: 0 ]) {
13501541 case ' +' :
1351- param = [param diff_stringByReplacingPercentEscapesForEncodeUriCompatibility ];
1542+ param = [self diff_decodeURIWithText: param ];
13521543 if (param == nil ) {
13531544 if (error != NULL ) {
13541545 errorDetail = [NSDictionary dictionaryWithObjectsAndKeys:
0 commit comments