 Contains the main functionality of the JSONSchemaLexer.
 """
 
-from typing import ClassVar
+from importlib.resources import files
+from pathlib import Path
+from typing import Any, ClassVar
+import json
 
 from pygments.lexers.data import (  # type: ignore[reportMissingTypeStubs]
     JsonLexer,
@@ -24,109 +27,159 @@ class JSONSchemaLexer(JsonLexer):
     ]
 
     data_types: ClassVar[list[str]] = [
-        "object",
-        "integer",
-        "string",
-        "number",
-        "array",
-        "boolean",
-        "null",
-    ]
-    core_keywords: ClassVar[list[str]] = [
-        "$schema",
-        "$id",
-        "$ref",
-        "$defs",
-        "$comment",
-        "$dynamicAnchor",
-        "$dynamicRef",
-        "$anchor",
-        "$vocabulary",
-    ]
-    applicator_keywords: ClassVar[list[str]] = [
-        "oneOf",
-        "allOf",
-        "anyOf",
-        "if",
-        "then",
-        "else",
-        "not",
-        "properties",
-        "patternProperties",
-        "additionalProperties",
-        "dependentSchemas",
-        "propertyNames",
-        "prefixItems",
-        "contains",
-        "items",
-    ]
-    meta_data_keywords: ClassVar[list[str]] = [
-        "title",
-        "description",
-        "default",
-        "deprecated",
-        "examples",
-        "readOnly",
-        "writeOnly",
-    ]
-    validation_keywords: ClassVar[list[str]] = [
-        "type",
-        "enum",
-        "const",
-        "minLength",
-        "maxLength",
-        "pattern",
-        "maximum",
-        "exclusiveMinimum",
-        "multipleOf",
-        "exclusiveMaximum",
-        "minimum",
-        "dependentRequired",
-        "minProperties",
-        "maxProperties",
-        "required",
-        "minItems",
-        "maxItems",
-        "minContains",
-        "maxContains",
-        "uniqueItems",
-    ]
-    other_keywords: ClassVar[list[str]] = [
-        "format",
-        "unevaluatedItems",
-        "unevaluatedProperties",
-        "contentEncoding",
-        "contentMediaType",
-        "contentSchema",
-        "format_assertion",
+        '"object"',
+        '"integer"',
+        '"string"',
+        '"number"',
+        '"array"',
+        '"boolean"',
+        '"null"',
     ]
+    keywords: ClassVar[dict[str | None, list[str]]] = {}
+    identifier: ClassVar[dict[str | None, str]] = {}
+    default_dialect = None
+
+    def __init__(self, default_dialect: str | None = None):
+        super().__init__()  # type: ignore[reportUnknownMemberType]
+        self._populate_keywords_and_identifiers()
+        # Normalise the dialect to the quoted form Pygments emits for strings.
+        if default_dialect and default_dialect[0] != '"':
+            default_dialect = '"' + default_dialect
+
+        if default_dialect and default_dialect[-1] != '"':
+            default_dialect = default_dialect + '"'
 
-    parsed_keywords: ClassVar[list[str]] = [
-        '"%s"' % keyword
-        for keyword in (
-            core_keywords
-            + applicator_keywords
-            + meta_data_keywords
-            + validation_keywords
-            + other_keywords
+        self.default_dialect = default_dialect
+
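+    # Load the per-dialect keyword tables bundled with the package under
+    # data/keywords/, one JSON file per dialect.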
+    def _populate_keywords_and_identifiers(self):
+        dialect_files = files("jsonschema_lexer") / "data" / "keywords"
+        if not dialect_files.is_dir():
+            dialect_files = Path(__file__).parent / "data" / "keywords"
+        for dialect_file in dialect_files.iterdir():
+            with dialect_file.open() as file:
+                json_content = json.load(file)
+                dialect_name = f'"{json_content["dialect"]}"'
+                self.keywords[dialect_name] = json_content["keywords"]
+                self.identifier[dialect_name] = (
+                    f'"{json_content["identifier"]}"'
+                )
+
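+    # Return the index of the rightmost stack entry matching ``token``,
+    # or None if it never occurs.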
+    def _find_rightmost_token_index(
+        self,
+        syntax_stack: list[tuple[int, str]],
+        token: str | None,
+    ):
+        return next(
+            (
+                i
+                for i, (_, t) in reversed(list(enumerate(syntax_stack)))
+                if t == token
+            ),
+            None,
         )
-    ]
 
-    parsed_data_types: ClassVar[list[str]] = [
-        '"%s"' % data_type for data_type in data_types
-    ]
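+    # Return the first double-quoted string token at or after ``index``,
+    # i.e. the value paired with the key found at that position.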
+    def _find_key_value_from_json(
+        self,
+        tokens: list[tuple[int, Any, str]],
+        index: int,
+    ):
+        return next(
+            (t[2] for t in tokens[index:] if t[1] is Token.String.Double),
+            None,
+        )
+
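+    # Walk the syntax stack for the nearest enclosing "$schema" and work
+    # out which dialect governs the token currently being emitted.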
+    def _get_nearest_valid_dialect(
+        self,
+        tokens: list[tuple[int, Any, str]],
+        syntax_stack: list[tuple[int, str]],
+        index: int | None = None,
+    ) -> str | None:
+        if not index:
+            index = len(syntax_stack) - 1
+
+        nearest_schema_index = self._find_rightmost_token_index(
+            syntax_stack[: index + 1],
+            '"$schema"',
+        )
+        if nearest_schema_index:
+            dialect = self._find_key_value_from_json(
+                tokens,
+                nearest_schema_index,
+            )
+            identifier = self.identifier.get(dialect, None)
+            is_dialect_valid = bool(
+                identifier or syntax_stack[nearest_schema_index][0] == 0,
+            )
+            nearest_identifier_index = self._find_rightmost_token_index(
+                syntax_stack[: index + 1],
+                identifier,
+            )
+            if (
+                nearest_identifier_index
+                and identifier
+                and syntax_stack[nearest_identifier_index][0]
+                == syntax_stack[nearest_schema_index][0]
+            ) or syntax_stack[nearest_schema_index][0] == 0:
+                return dialect
+            elif is_dialect_valid and nearest_identifier_index:
+                return self._get_nearest_valid_dialect(
+                    tokens,
+                    syntax_stack,
+                    nearest_identifier_index - 1,
+                )
+            elif is_dialect_valid and syntax_stack[-1][1] not in (
+                '"$id"',
+                '"id"',
+            ):
+                return self._get_nearest_valid_dialect(
+                    tokens,
+                    syntax_stack,
+                    nearest_schema_index - 1,
+                )
+
+        if self.default_dialect:
+            return self.default_dialect
+
+        return None
+
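+    # Re-tag one token: keywords of the active dialect become
+    # Token.Keyword, quoted JSON type names become Token.Name.Decorator.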
+    def _parse_token_tuple(
+        self,
+        token_tuple: tuple[int, Any, str],
+        keywords: list[str],
+    ):
+        start, token, value = token_tuple
+        keywords = ['"%s"' % keyword for keyword in keywords]
+        if token is Token.Name.Tag and value in keywords:
+            return start, Token.Keyword, value
+        elif token is Token.String.Double and value in self.data_types:
+            return start, Token.Name.Decorator, value
+        else:
+            return start, token, value
+
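+    # Track brace depth so every token can be matched against the
+    # keywords of the dialect governing its enclosing subschema.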
+    def map_tokens_by_schema(self, tokens: list[tuple[int, Any, str]]):
+        syntax_stack: list[tuple[int, str]] = []
+        cur_depth = -1
+        for start, token, value in tokens:
+            if value == "{":
+                cur_depth += 1
+
+            syntax_stack.append((cur_depth, value))
+
+            if value == "}":
+                while syntax_stack.pop()[1] != "{":
+                    continue
+                yield self._parse_token_tuple((start, token, value), [])
+            else:
+                dialect = self._get_nearest_valid_dialect(tokens, syntax_stack)
+                yield self._parse_token_tuple(
+                    (start, token, value),
+                    self.keywords.get(dialect, []),
+                )
 
     def get_tokens_unprocessed(self, text: str):  # type: ignore[reportUnknownParameterType]
         """
         Add token classes to it according to JSON Schema.
         """
-        for start, token, value in super().get_tokens_unprocessed(text):  # type: ignore[reportUnknownVariableType]
-            if token is Token.Name.Tag and value in self.parsed_keywords:
-                yield start, Token.Keyword, value
-            elif (
-                token is Token.String.Double
-                and value in self.parsed_data_types
-            ):
-                yield start, Token.Name.Decorator, value
-            else:
-                yield start, token, value
+        json_tokens: list[tuple[int, Any, str]] = list(
+            super().get_tokens_unprocessed(text),  # type: ignore[reportUnknownParameterType]
+        )
+        yield from self.map_tokens_by_schema(json_tokens)
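
For context, _populate_keywords_and_identifiers implies that each file under data/keywords/ carries the dialect URI, the keyword that identifies embedded subschemas, and the dialect's keyword list. An illustrative sketch of such a file (the actual bundled data may differ):

    {
      "dialect": "https://json-schema.org/draft/2020-12/schema",
      "identifier": "$id",
      "keywords": ["$schema", "$id", "type", "properties", "items", "required"]
    }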
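A minimal usage sketch of the new lexer; the import path and formatter choice are illustrative rather than taken from this commit:

    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    from jsonschema_lexer import JSONSchemaLexer  # assumed import path

    schema = """
    {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "type": "object",
        "properties": {"name": {"type": "string"}}
    }
    """

    # With the dialect resolved from "$schema" (or the default_dialect
    # fallback), keywords such as "type" should come back as Token.Keyword
    # and quoted type names like "object" as Token.Name.Decorator.
    lexer = JSONSchemaLexer(
        default_dialect="https://json-schema.org/draft/2020-12/schema",
    )
    print(highlight(schema, lexer, TerminalFormatter()))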