@@ -115,10 +115,11 @@ while (TRUE)
115115
116116#ifdef SUPPORT_UNICODE
117117
118- #define PARSE_CLASS_UTF 0x1
119- #define PARSE_CLASS_CASELESS_UTF 0x2
120- #define PARSE_CLASS_RESTRICTED_UTF 0x4
121- #define PARSE_CLASS_TURKISH_UTF 0x8
118+ #define PARSE_CLASS_UTF 0x01
119+ #define PARSE_CLASS_CASELESS_UTF 0x02
120+ #define PARSE_CLASS_RESTRICTED_UTF 0x04
121+ #define PARSE_CLASS_TURKISH_UTF 0x08
122+ #define PARSE_CLASS_COMPUTE_CATLIST 0x10
122123
123124/* Get the range of nocase characters which includes the
124125'c' character passed as argument, or directly follows 'c'. */
@@ -357,13 +358,28 @@ append_non_ascii_range(uint32_t options, uint32_t *buffer)
357358 return buffer + 2 ;
358359}
359360
361+ /* The buffer may represent the category list pointer when utf is enabled. */
360362static size_t
361363parse_class (uint32_t * ptr , uint32_t options , uint32_t * buffer )
362364{
363365size_t total_size = 0 ;
364366size_t size ;
365367uint32_t meta_arg ;
366368uint32_t start_char ;
369+ uint32_t ptype ;
370+ #ifdef SUPPORT_UNICODE
371+ uint32_t pdata ;
372+ uint32_t category_list ;
373+ uint32_t * pcategory_list = NULL ;
374+ #endif
375+
376+ #ifdef SUPPORT_UNICODE
377+ if ((options & PARSE_CLASS_COMPUTE_CATLIST ) != 0 )
378+ {
379+ pcategory_list = buffer ;
380+ buffer = NULL ;
381+ }
382+ #endif
367383
368384while (TRUE)
369385 {
@@ -407,7 +423,8 @@ while (TRUE)
407423 case ESC_p :
408424 case ESC_P :
409425 ptr ++ ;
410- if (meta_arg == ESC_p && (* ptr >> 16 ) == PT_ANY )
426+ ptype = (* ptr >> 16 );
427+ if (meta_arg == ESC_p && ptype == PT_ANY )
411428 {
412429 if (buffer != NULL )
413430 {
@@ -417,6 +434,43 @@ while (TRUE)
417434 }
418435 total_size += 2 ;
419436 }
437+ #ifdef SUPPORT_UNICODE
438+ if (pcategory_list == NULL ) break ;
439+
440+ category_list = 0 ;
441+
442+ switch (ptype )
443+ {
444+ case PT_LAMP :
445+ category_list = UCPCAT3 (ucp_Lu , ucp_Ll , ucp_Lt );
446+ break ;
447+
448+ case PT_GC :
449+ pdata = * ptr & 0xffff ;
450+ category_list = UCPCAT_RANGE (PRIV (ucp_typerange )[pdata ],
451+ PRIV (ucp_typerange )[pdata + 1 ] - 1 );
452+ break ;
453+
454+ case PT_PC :
455+ pdata = * ptr & 0xffff ;
456+ category_list = UCPCAT (pdata );
457+ break ;
458+
459+ case PT_WORD :
460+ category_list = UCPCAT2 (ucp_Mn , ucp_Pc ) | UCPCAT_L | UCPCAT_N ;
461+ break ;
462+
463+ case PT_ALNUM :
464+ category_list = UCPCAT_L | UCPCAT_N ;
465+ break ;
466+ }
467+
468+ if (category_list > 0 )
469+ {
470+ if (meta_arg == ESC_P ) category_list ^= UCPCAT_ALL ;
471+ * pcategory_list |= category_list ;
472+ }
473+ #endif
420474 break ;
421475 }
422476 ptr ++ ;
@@ -511,6 +565,9 @@ const uint32_t *char_list_next;
511565uint16_t * next_char ;
512566uint32_t char_list_start , char_list_end ;
513567uint32_t range_start , range_end ;
568+ #ifdef SUPPORT_UNICODE
569+ uint32_t category_list = 0 ;
570+ #endif
514571
515572#ifdef SUPPORT_UNICODE
516573if (options & PCRE2_UTF )
@@ -531,11 +588,22 @@ if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
531588
532589/* Compute required space for the range. */
533590
591+ #ifdef SUPPORT_UNICODE
592+ range_list_size = parse_class (start_ptr ,
593+ class_options | PARSE_CLASS_COMPUTE_CATLIST ,
594+ & category_list );
595+ #else
534596range_list_size = parse_class (start_ptr , class_options , NULL );
597+ #endif
535598PCRE2_ASSERT ((range_list_size & 0x1 ) == 0 );
536599
537600/* Allocate buffer. The total_size also represents the end of the buffer. */
538601
602+ #ifdef SUPPORT_UNICODE
603+ /* Replaced by an OP_ALLANY. */
604+ if (category_list == UCPCAT_ALL ) range_list_size = 2 ;
605+ #endif
606+
539607total_size = range_list_size +
540608 ((range_list_size >= 2 ) ? CHAR_LIST_EXTRA_SIZE : 0 );
541609
@@ -553,6 +621,21 @@ cranges->range_list_size = (uint16_t)range_list_size;
553621cranges -> char_lists_types = 0 ;
554622cranges -> char_lists_size = 0 ;
555623cranges -> char_lists_start = 0 ;
624+ #ifdef SUPPORT_UNICODE
625+ cranges -> category_list = category_list ;
626+ #endif
627+
628+ #ifdef SUPPORT_UNICODE
629+ if (category_list == UCPCAT_ALL )
630+ {
631+ /* Replace the xclass with OP_ALLANY. */
632+ cranges -> category_list = 0 ;
633+ buffer = (uint32_t * )(cranges + 1 );
634+ buffer [0 ] = 0 ;
635+ buffer [1 ] = get_highest_char (class_options );
636+ return cranges ;
637+ }
638+ #endif
556639
557640if (range_list_size == 0 ) return cranges ;
558641
@@ -1087,6 +1170,7 @@ BOOL utf = FALSE;
10871170
10881171#ifdef SUPPORT_WIDE_CHARS
10891172uint32_t xclass_props ;
1173+ uint32_t category_list ;
10901174PCRE2_UCHAR * class_uchardata ;
10911175class_ranges * cranges ;
10921176#else
@@ -1107,6 +1191,7 @@ should_flip_negation = FALSE;
11071191
11081192#ifdef SUPPORT_WIDE_CHARS
11091193xclass_props = 0 ;
1194+ category_list = 0 ;
11101195
11111196#if PCRE2_CODE_UNIT_WIDTH == 8
11121197cranges = NULL ;
@@ -1140,6 +1225,9 @@ if (utf)
11401225 cb -> first_data = cranges -> header .next ;
11411226 }
11421227
1228+ category_list = cranges -> category_list ;
1229+ PCRE2_ASSERT (category_list != UCPCAT_ALL );
1230+
11431231 if (cranges -> range_list_size > 0 )
11441232 {
11451233 const uint32_t * ranges = (const uint32_t * )(cranges + 1 );
@@ -1154,6 +1242,13 @@ if (utf)
11541242 }
11551243
11561244class_uchardata = code + LINK_SIZE + 2 ; /* For XCLASS items */
1245+
1246+ if (cranges != NULL && category_list != 0 &&
1247+ (xclass_props & XCLASS_HIGH_ANY ) == 0 )
1248+ {
1249+ xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS ;
1250+ class_uchardata += sizeof (uint32_t ) / sizeof (PCRE2_UCHAR );
1251+ }
11571252#endif /* SUPPORT_WIDE_CHARS */
11581253
11591254/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
@@ -1444,7 +1539,9 @@ while (TRUE)
14441539
14451540 PRIV (update_classbits )(ptype , pdata , (escape == ESC_P ), classbits );
14461541
1447- if ((xclass_props & XCLASS_HIGH_ANY ) == 0 )
1542+ if ((xclass_props & XCLASS_HIGH_ANY ) == 0 &&
1543+ ptype != PT_LAMP && ptype != PT_GC && ptype != PT_PC &&
1544+ ptype != PT_WORD && ptype != PT_ALNUM )
14481545 {
14491546 if (lengthptr != NULL )
14501547 * lengthptr += 3 ;
@@ -1709,6 +1806,15 @@ if ((xclass_props & XCLASS_REQUIRED) != 0)
17091806 * code = negate_class ? XCL_NOT :0 ;
17101807 if ((xclass_props & XCLASS_HAS_PROPS ) != 0 ) * code |= XCL_HASPROP ;
17111808
1809+ /* The category_list is placed after the class feature bitset.
1810+ The code pointer is not increased, because the bitset for the
1811+ first 256 characters may be injected after the feature bitset. */
1812+ if (category_list != 0 )
1813+ {
1814+ * code |= XCL_HASCATLIST ;
1815+ memmove (code + 1 , & category_list , sizeof (uint32_t ));
1816+ }
1817+
17121818 /* If the map is required, move up the extra data to make room for it;
17131819 otherwise just move the code pointer to the end of the extra data. */
17141820
0 commit comments