Skip to content

Commit 2032c19

Browse files
orgads and amitguptagwl committed
Change regexp to match the XML spec, supporting Unicode (#205)
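Replace the regexp with one that matches the XML spec [1], except for the range [\u10000-\uEFFFF], which is left out because it ends up matching digits (without the `u` flag, JavaScript reads `\u10000` as `\u1000` followed by a literal `0`, so the character class gains the range `0-\uEFFF`). Remove the localeRange option, which is no longer needed.

[1] https://www.w3.org/TR/xml/#NT-NameStartChar

Co-authored-by: Amit K Gupta <amitguptagwl@users.noreply.github.com>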
1 parent e26752b commit 2032c19

File tree

6 files changed

+29
-61
lines changed

6 files changed

+29
-61
lines changed

spec/validator_utf8_with_BOM_spec.js

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,10 @@
33
const validator = require("../src/validator");
44

55
describe("XMLParser", function() {
6-
76
it("should validate xml string with cyrillic characters", function() {
87
const BOM = "\ufeff";
9-
const options = {localeRange: "a-zA-Zа-яёА-ЯЁ"}
108
let xmlData = BOM + "<?xml version=\"1.0\" encoding=\"utf-8\" ?><КорневаяЗапись><Тэг>ЗначениеValue53456</Тэг></КорневаяЗапись>";
11-
let result = validator.validate(xmlData, options);
9+
let result = validator.validate(xmlData);
1210
expect(result).toBe(true);
1311

1412
});

spec/x_cyrillic_2j_str_spec.js

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"use strict";
22

33
const parser = require("../src/parser");
4-
const validator = require("../src/validator");
54

65
describe("XMLParser", function() {
76

@@ -13,29 +12,12 @@ describe("XMLParser", function() {
1312
}
1413
};
1514
const options = {
16-
localeRange: "а-яёА-ЯЁa-zA-Z",
1715
attributeNamePrefix : "@_"
1816
}
1917

20-
const result = parser.parse(xmlData, options, { localeRange: "а-яёА-ЯЁa-zA-Z" });
18+
const result = parser.parse(xmlData, options);
2119
expect(result).toEqual(expected);
2220
// console.log({ expected})
2321
// console.log({ result })
2422
});
25-
26-
it("should invalid XML with invalid localRange", function() {
27-
const xmlData = `<КорневаяЗапись><Тэг>ЗначениеValue53456</Тэг></КорневаяЗапись>`;
28-
29-
const expected = {
30-
"code": "InvalidOptions",
31-
"msg": "Invalid localeRange",
32-
"line": 1
33-
};
34-
35-
const result = validator.validate(xmlData , { localeRange: "а-яёА-ЯЁa-zA-Z<" }).err
36-
expect(result).toEqual(expected);
37-
// console.log({ expected})
38-
// console.log({ result })
39-
});
40-
4123
});

src/parser.d.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ type X2jOptions = {
1111
trimValues: boolean;
1212
cdataTagName: false | string;
1313
cdataPositionChar: string;
14-
localeRange: string;
1514
parseTrueNumberOnly: boolean;
1615
tagValueProcessor: (tagValue: string, tagName: string) => string;
1716
attrValueProcessor: (attrValue: string, attrName: string) => string;
@@ -20,7 +19,6 @@ type X2jOptions = {
2019
type X2jOptionsOptional = Partial<X2jOptions>;
2120
type validationOptions = {
2221
allowBooleanAttributes: boolean;
23-
localeRange: string;
2422
};
2523
type validationOptionsOptional = Partial<validationOptions>;
2624
type J2xOptions = {

src/util.js

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
'use strict';
22

3+
const nameStartChar = ':A-Za-z_\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD';
4+
const nameChar = nameStartChar + '\\-.\\d\\u00B7\\u0300-\\u036F\\u203F-\\u2040';
5+
const nameRegexp = '[' + nameStartChar + '][' + nameChar + ']*'
6+
const regexName = new RegExp('^' + nameRegexp + '$');
7+
38
const getAllMatches = function(string, regex) {
49
const matches = [];
510
let match = regex.exec(string);
@@ -15,15 +20,11 @@ const getAllMatches = function(string, regex) {
1520
return matches;
1621
};
1722

18-
const doesMatch = function(string, regex) {
19-
const match = regex.exec(string);
23+
const isName = function(string) {
24+
const match = regexName.exec(string);
2025
return !(match === null || typeof match === 'undefined');
2126
};
2227

23-
const doesNotMatch = function(string, regex) {
24-
return !doesMatch(string, regex);
25-
};
26-
2728
exports.isExist = function(v) {
2829
return typeof v !== 'undefined';
2930
};
@@ -81,6 +82,6 @@ exports.buildOptions = function(options, defaultOptions, props) {
8182
return newOptions;
8283
};
8384

84-
exports.doesMatch = doesMatch;
85-
exports.doesNotMatch = doesNotMatch;
85+
exports.isName = isName;
8686
exports.getAllMatches = getAllMatches;
87+
exports.nameRegexp = nameRegexp;

src/validator.js

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,9 @@ const util = require('./util');
44

55
const defaultOptions = {
66
allowBooleanAttributes: false, //A tag can have attributes without any value
7-
localeRange: 'a-zA-Z',
87
};
98

10-
const props = ['allowBooleanAttributes', 'localeRange'];
9+
const props = ['allowBooleanAttributes'];
1110

1211
//const tagsPattern = new RegExp("<\\/?([\\w:\\-_\.]+)\\s*\/?>","g");
1312
exports.validate = function (xmlData, options) {
@@ -16,12 +15,6 @@ exports.validate = function (xmlData, options) {
1615
//xmlData = xmlData.replace(/(\r\n|\n|\r)/gm,"");//make it single line
1716
//xmlData = xmlData.replace(/(^\s*<\?xml.*?\?>)/g,"");//Remove XML starting tag
1817
//xmlData = xmlData.replace(/(<!DOCTYPE[\s\w\"\.\/\-\:]+(\[.*\])*\s*>)/g,"");//Remove DOCTYPE
19-
const localRangeRegex = new RegExp(`[${options.localeRange}]`);
20-
21-
if (localRangeRegex.test("<#$'\"\\\/:0")) {
22-
return getErrorObject('InvalidOptions', 'Invalid localeRange', 1);
23-
}
24-
2518
const tags = [];
2619
let tagFound = false;
2720

@@ -32,8 +25,7 @@ exports.validate = function (xmlData, options) {
3225
// check for byte order mark (BOM)
3326
xmlData = xmlData.substr(1);
3427
}
35-
const regxAttrName = new RegExp(`^[${options.localeRange}_][${options.localeRange}0-9_\\-\\.:]*$`);
36-
const regxTagName = new RegExp(`^([${options.localeRange}_])[${options.localeRange}0-9\\.\\-_:]*$`);
28+
3729
for (let i = 0; i < xmlData.length; i++) {
3830
if (xmlData[i] === '<') {
3931
//starting of tag
@@ -78,7 +70,7 @@ exports.validate = function (xmlData, options) {
7870
//continue;
7971
i--;
8072
}
81-
if (!validateTagName(tagName, regxTagName)) {
73+
if (!validateTagName(tagName)) {
8274
let msg;
8375
if(tagName.trim().length === 0) {
8476
msg = "There is an unnecessary space between tag name and backward slash '</ ..'.";
@@ -98,7 +90,7 @@ exports.validate = function (xmlData, options) {
9890
if (attrStr[attrStr.length - 1] === '/') {
9991
//self closing tag
10092
attrStr = attrStr.substring(0, attrStr.length - 1);
101-
const isValid = validateAttributeString(attrStr, options, regxAttrName);
93+
const isValid = validateAttributeString(attrStr, options);
10294
if (isValid === true) {
10395
tagFound = true;
10496
//continue; //text may presents after self closing tag
@@ -126,7 +118,7 @@ exports.validate = function (xmlData, options) {
126118
}
127119
}
128120
} else {
129-
const isValid = validateAttributeString(attrStr, options, regxAttrName);
121+
const isValid = validateAttributeString(attrStr, options);
130122
if (isValid !== true) {
131123
//the result from the nested function returns the position of the error within the attribute
132124
//in order to get the 'true' error line, we need to calculate the position where the attribute begins (i - attrStr.length) and then add the position within the attribute
@@ -303,7 +295,7 @@ const validAttrStrRegxp = new RegExp('(\\s*)([^\\s=]+)(\\s*=)?(\\s*([\'"])(([\\s
303295

304296
//attr, ="sd", a="amit's", a="sd"b="saf", ab cd=""
305297

306-
function validateAttributeString(attrStr, options, regxAttrName) {
298+
function validateAttributeString(attrStr, options) {
307299
//console.log("start:"+attrStr+":end");
308300

309301
//if(attrStr.trim().length === 0) return true; //empty string
@@ -323,7 +315,7 @@ function validateAttributeString(attrStr, options, regxAttrName) {
323315
return { err: { code:"InvalidAttr",msg:"attribute " + matches[i][2] + " has no value assigned."}};
324316
} */
325317
const attrName = matches[i][2];
326-
if (!validateAttrName(attrName, regxAttrName)) {
318+
if (!validateAttrName(attrName)) {
327319
return getErrorObject('InvalidAttr', `Attribute '${attrName}' is an invalid name.`, getPositionFromMatch(attrStr, matches[i][0]));
328320
}
329321
if (!attrNames.hasOwnProperty(attrName)) {
@@ -382,19 +374,18 @@ function getErrorObject(code, message, lineNumber) {
382374
};
383375
}
384376

385-
function validateAttrName(attrName, regxAttrName) {
386-
// const validAttrRegxp = new RegExp(regxAttrName);
387-
return util.doesMatch(attrName, regxAttrName);
377+
function validateAttrName(attrName) {
378+
return util.isName(attrName);
388379
}
389380

390381
//const startsWithXML = new RegExp("^[Xx][Mm][Ll]");
391382
// startsWith = /^([a-zA-Z]|_)[\w.\-_:]*/;
392383

393-
function validateTagName(tagname, regxTagName) {
384+
function validateTagName(tagname) {
394385
/*if(util.doesMatch(tagname,startsWithXML)) return false;
395386
else*/
396387
//return !tagname.toLowerCase().startsWith("xml") || !util.doesNotMatch(tagname, regxTagName);
397-
return !util.doesNotMatch(tagname, regxTagName);
388+
return util.isName(tagname);
398389
}
399390

400391
//this function returns the line number for the character at the given index

src/xmlstr2xmlnode.js

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@ const util = require('./util');
44
const buildOptions = require('./util').buildOptions;
55
const xmlNode = require('./xmlNode');
66
const TagType = {OPENING: 1, CLOSING: 2, SELF: 3, CDATA: 4};
7-
let regx =
8-
'<((!\\[CDATA\\[([\\s\\S]*?)(]]>))|(([\\w:\\-._]*:)?([\\w:\\-._]+))([^>]*)>|((\\/)(([\\w:\\-._]*:)?([\\w:\\-._]+))\\s*>))([^<]*)';
7+
const regx =
8+
'<((!\\[CDATA\\[([\\s\\S]*?)(]]>))|((NAME:)?(NAME))([^>]*)>|((\\/)(NAME)\\s*>))([^<]*)'
9+
.replace(/NAME/g, util.nameRegexp);
910

1011
//const tagsRegx = new RegExp("<(\\/?[\\w:\\-\._]+)([^>]*)>(\\s*"+cdataRegx+")*([^<]+)?","g");
1112
//const tagsRegx = new RegExp("<(\\/?)((\\w*:)?([\\w:\\-\._]+))([^>]*)>([^<]*)("+cdataRegx+"([^<]*))*([^<]+)?","g");
@@ -32,7 +33,6 @@ const defaultOptions = {
3233
trimValues: true, //Trim string values of tag and attributes
3334
cdataTagName: false,
3435
cdataPositionChar: '\\c',
35-
localeRange: '',
3636
tagValueProcessor: function(a, tagName) {
3737
return a;
3838
},
@@ -58,7 +58,6 @@ const props = [
5858
'trimValues',
5959
'cdataTagName',
6060
'cdataPositionChar',
61-
'localeRange',
6261
'tagValueProcessor',
6362
'attrValueProcessor',
6463
'parseTrueNumberOnly',
@@ -74,7 +73,6 @@ const getTraversalObj = function(xmlData, options) {
7473
const xmlObj = new xmlNode('!xml');
7574
let currentNode = xmlObj;
7675

77-
regx = regx.replace(/\[\\w/g, '[' + options.localeRange + '\\w');
7876
const tagsRegx = new RegExp(regx, 'g');
7977
let tag = tagsRegx.exec(xmlData);
8078
let nextTag = tagsRegx.exec(xmlData);
@@ -83,7 +81,7 @@ const getTraversalObj = function(xmlData, options) {
8381

8482
if (tagType === TagType.CLOSING) {
8583
//add parsed data to parent node
86-
if (currentNode.parent && tag[14]) {
84+
if (currentNode.parent && tag[12]) {
8785
currentNode.parent.val = util.getValue(currentNode.parent.val) + '' + processTagValue(tag, options, currentNode.parent.tagname);
8886
}
8987
if (options.stopNodes.length && options.stopNodes.includes(currentNode.tagname)) {
@@ -101,14 +99,14 @@ const getTraversalObj = function(xmlData, options) {
10199
//for backtracking
102100
currentNode.val = util.getValue(currentNode.val) + options.cdataPositionChar;
103101
//add rest value to parent node
104-
if (tag[14]) {
102+
if (tag[12]) {
105103
currentNode.val += processTagValue(tag, options);
106104
}
107105
} else {
108106
currentNode.val = (currentNode.val || '') + (tag[3] || '') + processTagValue(tag, options);
109107
}
110108
} else if (tagType === TagType.SELF) {
111-
if (currentNode && tag[14]) {
109+
if (currentNode && tag[12]) {
112110
currentNode.val = util.getValue(currentNode.val) + '' + processTagValue(tag, options);
113111
}
114112

@@ -142,7 +140,7 @@ const getTraversalObj = function(xmlData, options) {
142140

143141
function processTagValue(parsedTags, options, parentTagName) {
144142
const tagName = parsedTags[7] || parentTagName;
145-
let val = parsedTags[14];
143+
let val = parsedTags[12];
146144
if (val) {
147145
if (options.trimValues) {
148146
val = val.trim();

0 commit comments

Comments
 (0)