Skip to content

Commit 63fa5e0

Browse files
committed
lpeg_patterns/iri: Start work on an IRI module
1 parent 4837b6d commit 63fa5e0

File tree

3 files changed

+321
-0
lines changed

3 files changed

+321
-0
lines changed

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,25 @@ Parses URIs as described in [RFC-3986](https://tools.ietf.org/html/rfc3986).
8888
- `sub_delims` (pattern): the set of subcomponent delimiters
8989

9090

91+
### `iri`
92+
93+
Parses IRIs as described in [RFC-3987](https://tools.ietf.org/html/rfc3987).
94+
95+
Very similar to the [uri](#uri) module, except that it also allows non-ASCII characters encoded as UTF-8.
96+
97+
- `IRI` (pattern): on success, returns a table with fields: (similar to [luasocket](http://w3.impa.br/~diego/software/luasocket/url.html#parse))
98+
- `scheme`
99+
- `userinfo`
100+
- `host`
101+
- `port`
102+
- `path`
103+
- `query`
104+
- `fragment`
105+
- `absolute_IRI` (pattern): similar to `IRI`, but does not permit fragments
106+
- `IRI_reference` (pattern): similar to `IRI`, but also permits relative IRI references
107+
- `ipath` (pattern): matches the path portion of an IRI. Captures `nil` for the empty path.
108+
109+
91110
### `email`
92111

93112
- `mailbox` (pattern): the mailbox format: matches either `name_addr` or an addr-spec.

lpeg_patterns/iri.lua

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
-- https://tools.ietf.org/html/rfc3987
2+
3+
local lpeg = require "lpeg"
4+
local core = require "lpeg_patterns.core"
5+
local uri = require "lpeg_patterns.uri"
6+
local IPv4 = require "lpeg_patterns.IPv4"
7+
8+
local Cc = lpeg.Cc
9+
local Cg = lpeg.Cg
10+
local Cs = lpeg.Cs
11+
local Ct = lpeg.Ct
12+
local Cmt = lpeg.Cmt
13+
local P = lpeg.P
14+
local R = lpeg.R
15+
local S = lpeg.S
16+
17+
local _M = {}
18+
19+
local cont = R"\128\191" -- continuation byte (10xxxxxx)

-- Matches one well-formed UTF-8 sequence (RFC 3629, section 4) and captures
-- its code point as a number.
-- The second byte is restricted after the lead bytes E0, ED, F0 and F4 so that
-- overlong forms, UTF-16 surrogates (U+D800-U+DFFF) and values above U+10FFFF
-- fail to match instead of being decoded.
-- The constant subtracted by each decoder is the sum of the marker bits of the
-- lead and continuation bytes, e.g. 12416 == 0xC0 * 64 + 0x80.
-- NOTE: this local shadows the standard `utf8` library of Lua 5.3+ in this file.
local utf8 = R"\0\127" / string.byte
	+ R"\194\223" * cont / function(s)
		local c1, c2 = string.byte(s, 1, 2)
		return c1 * 64 + c2 - 12416
	end
	+ (
		P"\224" * R"\160\191" * cont -- E0: second byte A0-BF (no overlong forms)
		+ R"\225\236" * cont * cont
		+ P"\237" * R"\128\159" * cont -- ED: second byte 80-9F (no surrogates)
		+ R"\238\239" * cont * cont
	) / function(s)
		local c1, c2, c3 = string.byte(s, 1, 3)
		return (c1 * 64 + c2) * 64 + c3 - 925824
	end
	+ (
		P"\240" * R"\144\191" * cont * cont -- F0: second byte 90-BF (no overlong forms)
		+ R"\241\243" * cont * cont * cont
		+ P"\244" * R"\128\143" * cont * cont -- F4: second byte 80-8F (<= U+10FFFF)
	) / function(s)
		local c1, c2, c3, c4 = string.byte(s, 1, 4)
		return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
	end
33+
34+
-- Inclusive code point ranges of the `ucschar` rule of RFC 3987 (section 2.2),
-- in ascending order.
local ucschar_ranges = {
	{0xA0, 0xD7FF}, {0xF900, 0xFDCF}, {0xFDF0, 0xFFEF},
	{0x10000, 0x1FFFD}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD},
	{0x40000, 0x4FFFD}, {0x50000, 0x5FFFD}, {0x60000, 0x6FFFD},
	{0x70000, 0x7FFFD}, {0x80000, 0x8FFFD}, {0x90000, 0x9FFFD},
	{0xA0000, 0xAFFFD}, {0xB0000, 0xBFFFD}, {0xC0000, 0xCFFFD},
	{0xD0000, 0xDFFFD}, {0xE1000, 0xEFFFD},
}

-- Matches a single UTF-8 encoded character whose code point is a `ucschar`.
-- The callback returns a bare boolean: any extra return value of a match-time
-- capture becomes a capture value, and an enclosing lpeg.Cs would then replace
-- the matched character with it. Returning only `true` accepts the bytes
-- already consumed by `utf8` and produces no capture.
local ucschar = Cmt(utf8, function(_, _, codepoint)
	for n = 1, #ucschar_ranges do
		local range = ucschar_ranges[n]
		-- Ranges are sorted, so the first upper bound that fits decides.
		if codepoint <= range[2] then
			return codepoint >= range[1]
		end
	end
	return false
end)
77+
78+
-- iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
local iunreserved = core.ALPHA + core.DIGIT + S"-._~" + ucschar

-- iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" )
local iuserinfo = Cs((iunreserved + uri.pct_encoded + uri.sub_delims + P":")^0)

-- TODO: Normalisation
local ireg_name_char = iunreserved + uri.pct_encoded + uri.sub_delims
-- Captures the (substituted) name, or nil when the name is empty.
local ireg_name = Cs(ireg_name_char^1) + Cc(nil)

-- An ordered choice never revisits an alternative that already succeeded, so
-- the IPv4 branch must refuse to match a mere prefix of a longer registered
-- name such as "1.2.3.4.example.com"; RFC 3986 section 3.2.2 applies
-- "first-match-wins" to the host as a whole.
-- IP literals and IPv4 addresses are converted with `tostring`.
local ihost = (uri.IP_literal + IPv4.IPv4address * -ireg_name_char) / tostring + ireg_name
89+
90+
-- ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
local ipchar = iunreserved + uri.pct_encoded + uri.sub_delims + S":@"

-- Path segments: possibly empty, non-empty, and non-empty without any colon.
local isegment = ipchar^0
local isegment_nz = ipchar^1
local isegment_nz_nc = (ipchar - P":")^1

-- A "/" followed by one (possibly empty) segment.
local slash_isegment = P"/" * isegment

-- The empty path is captured as nil rather than as the empty string.
local ipath_empty = Cc(nil)
-- Begins with "/" or is empty.
local ipath_abempty = Cs(slash_isegment^1) + ipath_empty
-- Begins with a non-empty segment.
local ipath_rootless = Cs(isegment_nz * slash_isegment^0)
-- Begins with a non-empty segment that contains no colon.
local ipath_noscheme = Cs(isegment_nz_nc * slash_isegment^0)
-- Begins with "/", optionally followed by a rootless path.
local ipath_absolute = Cs(P"/" * (isegment_nz * slash_isegment^0)^-1)
100+
101+
-- Matches a single UTF-8 encoded character from the private-use ranges of the
-- `iprivate` rule of RFC 3987: %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD.
-- The callback returns a bare boolean: any extra return value of a match-time
-- capture becomes a capture value, and an enclosing lpeg.Cs would then replace
-- the matched character with it.
local iprivate = Cmt(utf8, function(_, _, codepoint)
	return (codepoint >= 0xE000 and codepoint <= 0xF8FF)
		or (codepoint >= 0xF0000 and codepoint <= 0xFFFFD)
		or (codepoint >= 0x100000 and codepoint <= 0x10FFFD)
end)
116+
117+
-- iquery = *( ipchar / iprivate / "/" / "?" )
local iquery = Cs((ipchar + iprivate + S"/?")^0)

-- ifragment = *( ipchar / "/" / "?" )
local ifragment = Cs((ipchar + S"/?")^0)

-- iauthority = [ iuserinfo "@" ] ihost [ ":" port ]
-- Each component is stored as a named group for the enclosing table capture.
local iauthority = (Cg(iuserinfo, "userinfo") * P"@")^-1
	* Cg(ihost, "host")
	* (P":" * Cg(uri.port, "port"))^-1

local ihier_part = P"//" * iauthority * Cg(ipath_abempty, "path")
	+ Cg(ipath_absolute + ipath_rootless + ipath_empty, "path")

local iquery_part = (P"?" * Cg(iquery, "query"))^-1
local ifragment_part = (P"#" * Cg(ifragment, "fragment"))^-1

-- scheme ":" ihier-part [ "?" iquery ]; shared by absolute_IRI and IRI.
local iabsolute = Cg(uri.scheme, "scheme") * P":" * ihier_part * iquery_part

-- Like IRI, but without a fragment.
_M.absolute_IRI = Ct(iabsolute)

-- On success captures a table with the fields that were present:
-- scheme, userinfo, host, port, path, query, fragment.
_M.IRI = Ct(iabsolute * ifragment_part)

local irelative_part = P"//" * iauthority * Cg(ipath_abempty, "path")
	+ Cg(ipath_absolute + ipath_noscheme + ipath_empty, "path")
local irelative_ref = Ct(irelative_part * iquery_part * ifragment_part)

-- Either a full IRI or a relative reference.
_M.IRI_reference = _M.IRI + irelative_ref

-- Any path. `ipath_abempty` ends in an alternative that always succeeds (the
-- empty path), so trying it first would make every later alternative
-- unreachable. Only its non-empty form is tried first here; that form also
-- covers every absolute path, and `ipath_rootless` covers every no-scheme path.
_M.ipath = Cs((P"/" * isegment)^1) + ipath_rootless + ipath_empty

return _M

spec/iri_spec.lua

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
local lpeg = require "lpeg"
2+
describe("IRI", function()
	local iri_lib = require "lpeg_patterns.iri"
	-- Anchor every pattern at the end of input so that a partial match fails.
	local EOF = lpeg.P(-1)
	local absolute_IRI = iri_lib.absolute_IRI * EOF
	local IRI = iri_lib.IRI * EOF
	local iref = iri_lib.IRI_reference * EOF
	local ipath = iri_lib.ipath * EOF

	-- Asserts that matching `subject` against `patt` yields `expected`.
	local function check(patt, subject, expected)
		assert.same(expected, patt:match(subject))
	end

	it("Should break down full IRIs correctly", function()
		check(IRI, "scheme://userinfo@host:1234/path?query#fragment",
			{scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"})
		check(IRI, "scheme://userinfo@host:1234/path?query",
			{scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query"})
		check(IRI, "scheme://userinfo@host:1234/path",
			{scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path"})
		check(IRI, "scheme://host:1234/path",
			{scheme="scheme", host="host", port=1234, path="/path"})
		check(IRI, "scheme://host/path",
			{scheme="scheme", host="host", path="/path"})
		check(IRI, "scheme:///path",
			{scheme="scheme", path="/path"})
		check(IRI, "scheme://",
			{scheme="scheme"})
	end)

	it("Normalises to lower case scheme", function()
		check(IRI, "Scheme://", {scheme="scheme"})
		check(IRI, "SCHEME://", {scheme="scheme"})
	end)

	it("shouldn't allow fragments when using absolute_IRI", function()
		assert.falsy(absolute_IRI:match("scheme://userinfo@host:1234/path?query#fragment"))
		check(absolute_IRI, "scheme://userinfo@host:1234/path?query",
			{scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query"})
	end)

	it("Should break down relative IRIs correctly", function()
		check(iref, "scheme://userinfo@host:1234/path?query#fragment",
			{scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"})
		check(iref, "//userinfo@host:1234/path?query#fragment",
			{userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"})
		check(iref, "//host:1234/path?query#fragment",
			{host="host", port=1234, path="/path", query="query", fragment="fragment"})
		check(iref, "//host/path?query#fragment",
			{host="host", path="/path", query="query", fragment="fragment"})
		check(iref, "///path?query#fragment",
			{path="/path", query="query", fragment="fragment"})
		check(iref, "/path?query#fragment",
			{path="/path", query="query", fragment="fragment"})
		check(iref, "/path#fragment",
			{path="/path", fragment="fragment"})
		check(iref, "/path",
			{path="/path"})
		check(iref, "",
			{})
		check(iref, "?query",
			{query="query"})
		check(iref, "#fragment",
			{fragment="fragment"})
	end)

	it("Should match file urls", function()
		check(IRI, "file:///var/log/messages", {scheme="file", path="/var/log/messages"})
		check(IRI, "file:///C:/Windows/", {scheme="file", path="/C:/Windows/"})
	end)

	it("Should decode unreserved percent characters path", function()
		check(ipath, "/underscore%5Fcharacter", "/underscore_character")
		check(ipath, "/null%00byte", "/null%00byte")
	end)

	it("Should fail on incorrect percent characters", function()
		assert.falsy(ipath:match("/bad%x0percent"))
		assert.falsy(ipath:match("/%s"))
	end)

	it("Should not introduce ambiguiuty by decoding percent encoded entities", function()
		check(iref, "?query%26with&ampersand", {query="query%26with&ampersand"})
	end)

	it("Should decode unreserved percent characters in query and fragment", function()
		check(iref, "?query%20with%5Fescapes", {query="query%20with_escapes"})
		check(iref, "#fragment%20with%5Fescapes", {fragment="fragment%20with_escapes"})
	end)

	it("Should match localhost", function()
		check(iref, "//localhost", {host="localhost"})
		check(iref, "//LOCALHOST", {host="localhost"})
		check(iref, "//l%4FcAlH%6fSt", {host="localhost"})
		check(iref, "//localhost:8000", {host="localhost", port=8000})
		check(IRI, "http://localhost:8000", {scheme="http", host="localhost", port=8000})
	end)

	it("Should work with IPv6", function()
		check(iref, "//[::1]", {host="0:0:0:0:0:0:0:1"})
		check(iref, "//[::1]:80", {host="0:0:0:0:0:0:0:1", port=80})
	end)

	it("IPvFuture", function()
		check(iref, "//[v4.2]:80", {host="v4.2", port=80})
		check(iref, "//[V4.2]:80", {host="v4.2", port=80})
	end)

	it("Should work with IPv6 zone local addresses", function()
		check(iref, "//[::1%25eth0]", {host="0:0:0:0:0:0:0:1%eth0"})
	end)

	it("Relative IRI does not match authority when scheme is missing", function()
		check(iref, "example.com/", {path="example.com/"}) -- should end up in path
		check(iref, "scheme://example.com/", {scheme="scheme", host="example.com", path="/"})
	end)

	it("Should work with mailto URIs", function()
		check(IRI, "mailto:user@example.com",
			{scheme="mailto", path="user@example.com"})
		check(IRI, "mailto:someone@example.com,someoneelse@example.com",
			{scheme="mailto", path="someone@example.com,someoneelse@example.com"})
		check(IRI, "mailto:user@example.com?subject=This%20is%20the%20subject&cc=someone_else@example.com&body=This%20is%20the%20body",
			{scheme="mailto", path="user@example.com", query="subject=This%20is%20the%20subject&cc=someone_else@example.com&body=This%20is%20the%20body"})

		-- Examples from RFC-6068
		-- Section 6.1
		check(IRI, "mailto:chris@example.com",
			{scheme="mailto", path="chris@example.com"})
		check(IRI, "mailto:infobot@example.com?subject=current-issue",
			{scheme="mailto", path="infobot@example.com", query="subject=current-issue"})
		check(IRI, "mailto:infobot@example.com?body=send%20current-issue",
			{scheme="mailto", path="infobot@example.com", query="body=send%20current-issue"})
		check(IRI, "mailto:infobot@example.com?body=send%20current-issue%0D%0Asend%20index",
			{scheme="mailto", path="infobot@example.com", query="body=send%20current-issue%0D%0Asend%20index"})
		check(IRI, "mailto:list@example.org?In-Reply-To=%3C3469A91.D10AF4C@example.com%3E",
			{scheme="mailto", path="list@example.org", query="In-Reply-To=%3C3469A91.D10AF4C@example.com%3E"})
		check(IRI, "mailto:majordomo@example.com?body=subscribe%20bamboo-l",
			{scheme="mailto", path="majordomo@example.com", query="body=subscribe%20bamboo-l"})
		check(IRI, "mailto:joe@example.com?cc=bob@example.com&body=hello",
			{scheme="mailto", path="joe@example.com", query="cc=bob@example.com&body=hello"})
		check(IRI, "mailto:gorby%25kremvax@example.com",
			{scheme="mailto", path="gorby%25kremvax@example.com"})
		check(IRI, "mailto:unlikely%3Faddress@example.com?blat=foop",
			{scheme="mailto", path="unlikely%3Faddress@example.com", query="blat=foop"})
		check(IRI, "mailto:Mike%26family@example.org",
			{scheme="mailto", path="Mike%26family@example.org"})
		-- Section 6.2
		check(IRI, "mailto:%22not%40me%22@example.org",
			{scheme="mailto", path=[[%22not%40me%22@example.org]]})
		check(IRI, "mailto:%22oh%5C%5Cno%22@example.org",
			{scheme="mailto", path=[[%22oh%5C%5Cno%22@example.org]]})
		check(IRI, "mailto:%22%5C%5C%5C%22it's%5C%20ugly%5C%5C%5C%22%22@example.org",
			{scheme="mailto", path=[[%22%5C%5C%5C%22it's%5C%20ugly%5C%5C%5C%22%22@example.org]]})
	end)

	it("Should work with xmpp URIs", function()
		-- Examples from RFC-5122
		check(IRI, "xmpp:node@example.com",
			{scheme="xmpp", path="node@example.com"})
		check(IRI, "xmpp://guest@example.com",
			{scheme="xmpp", userinfo="guest", host="example.com"})
		check(IRI, "xmpp://guest@example.com/support@example.com?message",
			{scheme="xmpp", userinfo="guest", host="example.com", path="/support@example.com", query="message"})
		check(IRI, "xmpp:support@example.com?message",
			{scheme="xmpp", path="support@example.com", query="message"})

		check(IRI, "xmpp:example-node@example.com",
			{scheme="xmpp", path="example-node@example.com"})
		check(IRI, "xmpp:example-node@example.com/some-resource",
			{scheme="xmpp", path="example-node@example.com/some-resource"})
		check(IRI, "xmpp:example.com",
			{scheme="xmpp", path="example.com"})
		check(IRI, "xmpp:example-node@example.com?message",
			{scheme="xmpp", path="example-node@example.com", query="message"})
		check(IRI, "xmpp:example-node@example.com?message;subject=Hello%20World",
			{scheme="xmpp", path="example-node@example.com", query="message;subject=Hello%20World"})
		check(IRI, "xmpp:nasty!%23$%25()*+,-.;=%3F%5B%5C%5D%5E_%60%7B%7C%7D~node@example.com",
			{scheme="xmpp", path=[[nasty!%23$%25()*+,-.;=%3F%5B%5C%5D%5E_%60%7B%7C%7D~node@example.com]]})
		check(IRI, [[xmpp:node@example.com/repulsive%20!%23%22$%25&'()*+,-.%2F:;%3C=%3E%3F%40%5B%5C%5D%5E_%60%7B%7C%7D~resource]],
			{scheme="xmpp", path=[[node@example.com/repulsive%20!%23%22$%25&'()*+,-.%2F:;%3C=%3E%3F%40%5B%5C%5D%5E_%60%7B%7C%7D~resource]]})
		check(IRI, "xmpp:ji%C5%99i@%C4%8Dechy.example/v%20Praze",
			{scheme="xmpp", path="ji%C5%99i@%C4%8Dechy.example/v%20Praze"})
	end)
end)

0 commit comments

Comments
 (0)