@@ -116,10 +116,11 @@ while (TRUE)
116116
117117#ifdef SUPPORT_UNICODE
118118
119- #define PARSE_CLASS_UTF 0x1
120- #define PARSE_CLASS_CASELESS_UTF 0x2
121- #define PARSE_CLASS_RESTRICTED_UTF 0x4
122- #define PARSE_CLASS_TURKISH_UTF 0x8
119+ #define PARSE_CLASS_UTF 0x01
120+ #define PARSE_CLASS_CASELESS_UTF 0x02
121+ #define PARSE_CLASS_RESTRICTED_UTF 0x04
122+ #define PARSE_CLASS_TURKISH_UTF 0x08
123+ #define PARSE_CLASS_COMPUTE_CATLIST 0x10
123124
124125/* Get the range of nocase characters which includes the
125126'c' character passed as argument, or directly follows 'c'. */
@@ -358,13 +359,28 @@ append_non_ascii_range(uint32_t options, uint32_t *buffer)
358359 return buffer + 2 ;
359360}
360361
362+ /* The buffer may represent the category list pointer when utf is enabled. */
361363static size_t
362364parse_class (uint32_t * ptr , uint32_t options , uint32_t * buffer )
363365{
364366size_t total_size = 0 ;
365367size_t size ;
366368uint32_t meta_arg ;
367369uint32_t start_char ;
370+ uint32_t ptype ;
371+ #ifdef SUPPORT_UNICODE
372+ uint32_t pdata ;
373+ uint32_t category_list ;
374+ uint32_t * pcategory_list = NULL ;
375+ #endif
376+
377+ #ifdef SUPPORT_UNICODE
378+ if ((options & PARSE_CLASS_COMPUTE_CATLIST ) != 0 )
379+ {
380+ pcategory_list = buffer ;
381+ buffer = NULL ;
382+ }
383+ #endif
368384
369385while (TRUE)
370386 {
@@ -408,7 +424,8 @@ while (TRUE)
408424 case ESC_p :
409425 case ESC_P :
410426 ptr ++ ;
411- if (meta_arg == ESC_p && (* ptr >> 16 ) == PT_ANY )
427+ ptype = (* ptr >> 16 );
428+ if (meta_arg == ESC_p && ptype == PT_ANY )
412429 {
413430 if (buffer != NULL )
414431 {
@@ -418,6 +435,43 @@ while (TRUE)
418435 }
419436 total_size += 2 ;
420437 }
438+ #ifdef SUPPORT_UNICODE
439+ if (pcategory_list == NULL ) break ;
440+
441+ category_list = 0 ;
442+
443+ switch (ptype )
444+ {
445+ case PT_LAMP :
446+ category_list = UCPCAT3 (ucp_Lu , ucp_Ll , ucp_Lt );
447+ break ;
448+
449+ case PT_GC :
450+ pdata = * ptr & 0xffff ;
451+ category_list = UCPCAT_RANGE (PRIV (ucp_typerange )[pdata ],
452+ PRIV (ucp_typerange )[pdata + 1 ] - 1 );
453+ break ;
454+
455+ case PT_PC :
456+ pdata = * ptr & 0xffff ;
457+ category_list = UCPCAT (pdata );
458+ break ;
459+
460+ case PT_WORD :
461+ category_list = UCPCAT2 (ucp_Mn , ucp_Pc ) | UCPCAT_L | UCPCAT_N ;
462+ break ;
463+
464+ case PT_ALNUM :
465+ category_list = UCPCAT_L | UCPCAT_N ;
466+ break ;
467+ }
468+
469+ if (category_list > 0 )
470+ {
471+ if (meta_arg == ESC_P ) category_list ^= UCPCAT_ALL ;
472+ * pcategory_list |= category_list ;
473+ }
474+ #endif
421475 break ;
422476 }
423477 ptr ++ ;
@@ -512,6 +566,9 @@ const uint32_t *char_list_next;
512566uint16_t * next_char ;
513567uint32_t char_list_start , char_list_end ;
514568uint32_t range_start , range_end ;
569+ #ifdef SUPPORT_UNICODE
570+ uint32_t category_list = 0 ;
571+ #endif
515572
516573#ifdef SUPPORT_UNICODE
517574if (options & PCRE2_UTF )
@@ -529,11 +586,21 @@ if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
529586
530587/* Compute required space for the range. */
531588
589+ #ifdef SUPPORT_UNICODE
590+ range_list_size = parse_class (start_ptr ,
591+ class_options | PARSE_CLASS_COMPUTE_CATLIST ,
592+ & category_list );
593+ #else
532594range_list_size = parse_class (start_ptr , class_options , NULL );
595+ #endif
533596PCRE2_ASSERT ((range_list_size & 0x1 ) == 0 );
534597
535598/* Allocate buffer. The total_size also represents the end of the buffer. */
536599
600+ #ifdef SUPPORT_UNICODE
601+ if (category_list == UCPCAT_ALL ) range_list_size = 2 ;
602+ #endif
603+
537604total_size = range_list_size +
538605 ((range_list_size >= 2 ) ? CHAR_LIST_EXTRA_SIZE : 0 );
539606
@@ -548,6 +615,21 @@ cranges->range_list_size = (uint16_t)range_list_size;
548615cranges -> char_lists_types = 0 ;
549616cranges -> char_lists_size = 0 ;
550617cranges -> char_lists_start = 0 ;
618+ #ifdef SUPPORT_UNICODE
619+ cranges -> category_list = category_list ;
620+ #endif
621+
622+ #ifdef SUPPORT_UNICODE
623+ if (category_list == UCPCAT_ALL )
624+ {
625+ /* Replace the xclass with OP_ALLANY. */
626+ cranges -> category_list = 0 ;
627+ buffer = (uint32_t * )(cranges + 1 );
628+ buffer [0 ] = 0 ;
629+ buffer [1 ] = get_highest_char (options );
630+ return cranges ;
631+ }
632+ #endif
551633
552634if (range_list_size == 0 ) return cranges ;
553635
@@ -1042,6 +1124,7 @@ BOOL utf = FALSE;
10421124
10431125#ifdef SUPPORT_WIDE_CHARS
10441126uint32_t xclass_props ;
1127+ uint32_t category_list ;
10451128PCRE2_UCHAR * class_uchardata ;
10461129class_ranges * cranges ;
10471130#endif
@@ -1058,6 +1141,7 @@ should_flip_negation = FALSE;
10581141
10591142#ifdef SUPPORT_WIDE_CHARS
10601143xclass_props = 0 ;
1144+ category_list = 0 ;
10611145
10621146#if PCRE2_CODE_UNIT_WIDTH == 8
10631147cranges = NULL ;
@@ -1091,6 +1175,9 @@ if (utf)
10911175 cb -> cranges = cranges -> next ;
10921176 }
10931177
1178+ category_list = cranges -> category_list ;
1179+ PCRE2_ASSERT (category_list != UCPCAT_ALL );
1180+
10941181 if (cranges -> range_list_size > 0 )
10951182 {
10961183 const uint32_t * ranges = (const uint32_t * )(cranges + 1 );
@@ -1105,6 +1192,13 @@ if (utf)
11051192 }
11061193
11071194class_uchardata = code + LINK_SIZE + 2 ; /* For XCLASS items */
1195+
1196+ if (cranges != NULL && category_list != 0 &&
1197+ (xclass_props & XCLASS_HIGH_ANY ) == 0 )
1198+ {
1199+ xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS ;
1200+ class_uchardata += sizeof (uint32_t ) / sizeof (PCRE2_UCHAR );
1201+ }
11081202#endif /* SUPPORT_WIDE_CHARS */
11091203
11101204/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
@@ -1380,7 +1474,9 @@ while (TRUE)
13801474
13811475 PRIV (update_classbits )(ptype , pdata , (escape == ESC_P ), classbits );
13821476
1383- if ((xclass_props & XCLASS_HIGH_ANY ) == 0 )
1477+ if ((xclass_props & XCLASS_HIGH_ANY ) == 0 &&
1478+ ptype != PT_LAMP && ptype != PT_GC && ptype != PT_PC &&
1479+ ptype != PT_WORD && ptype != PT_ALNUM )
13841480 {
13851481 if (lengthptr != NULL )
13861482 * lengthptr += 3 ;
@@ -1640,6 +1736,12 @@ if ((xclass_props & XCLASS_REQUIRED) != 0)
16401736 code += LINK_SIZE ;
16411737 * code = negate_class ? XCL_NOT :0 ;
16421738 if ((xclass_props & XCLASS_HAS_PROPS ) != 0 ) * code |= XCL_HASPROP ;
1739+ /* This should be the last one. */
1740+ if (category_list != 0 )
1741+ {
1742+ * code |= XCL_HASCATLIST ;
1743+ memmove (code + 1 , & category_list , sizeof (uint32_t ));
1744+ }
16431745
16441746 /* If the map is required, move up the extra data to make room for it;
16451747 otherwise just move the code pointer to the end of the extra data. */
0 commit comments