Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,25 @@ Parses URIs as described in [RFC-3986](https://tools.ietf.org/html/rfc3986).
- `sub_delims` (pattern): the set of subcomponent delimiters


### `iri`

Parses IRIs as described in [RFC-3987](https://tools.ietf.org/html/rfc3987).

Very similar to the [uri](#uri) module, except that it also allows UTF-8 encoded non-ASCII characters.

- `IRI` (pattern): on success, returns a table with fields: (similar to [luasocket](http://w3.impa.br/~diego/software/luasocket/url.html#parse))
- `scheme`
- `userinfo`
- `host`
- `port`
- `path`
- `query`
- `fragment`
- `absolute_IRI` (pattern): similar to `IRI`, but does not permit fragments
- `IRI_reference` (pattern): similar to `IRI`, but permits relative IRIs
- `ipath` (pattern): matches the path portion of an IRI. Captures `nil` for the empty path.


### `email`

- `mailbox` (pattern): the mailbox format: matches either `name_addr` or an addr-spec.
Expand Down
152 changes: 152 additions & 0 deletions lpeg_patterns/iri.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
-- https://tools.ietf.org/html/rfc3987

local lpeg = require "lpeg"
local core = require "lpeg_patterns.core"
local uri = require "lpeg_patterns.uri"
local IPv4 = require "lpeg_patterns.IPv4"

local Cc = lpeg.Cc
local Cg = lpeg.Cg
local Cs = lpeg.Cs
local Ct = lpeg.Ct
local Cmt = lpeg.Cmt
local P = lpeg.P
local R = lpeg.R
local S = lpeg.S

local _M = {}

-- Decodes one UTF-8 encoded character and captures its code point as a number.
-- The byte ranges follow the UTF8-char grammar of RFC 3629 section 4, so
-- overlong encodings, UTF-16 surrogates (U+D800-U+DFFF) and values above
-- U+10FFFF fail to match instead of being decoded into a misleading code point.
-- Note: this local shadows the standard `utf8` library of Lua 5.3+ for the
-- rest of the file.
local cont = R"\128\191" -- continuation byte
local utf8 = R"\0\127" / string.byte
	+ R"\194\223" * cont / function(s)
		local c1, c2 = string.byte(s, 1, 2)
		-- 12416 == 0xC0 * 64 + 0x80: removes the marker bits of both bytes
		return c1 * 64 + c2 - 12416
	end
	+ (
		P"\224" * R"\160\191" * cont -- E0: second byte below A0 would be overlong
		+ R"\225\236" * cont * cont
		+ P"\237" * R"\128\159" * cont -- ED: second byte above 9F would be a surrogate
		+ R"\238\239" * cont * cont
	) / function(s)
		local c1, c2, c3 = string.byte(s, 1, 3)
		-- 925824 == (0xE0 * 64 + 0x80) * 64 + 0x80
		return (c1 * 64 + c2) * 64 + c3 - 925824
	end
	+ (
		P"\240" * R"\144\191" * cont * cont -- F0: second byte below 90 would be overlong
		+ R"\241\243" * cont * cont * cont
		+ P"\244" * R"\128\143" * cont * cont -- F4: second byte above 8F exceeds U+10FFFF
	) / function(s)
		local c1, c2, c3, c4 = string.byte(s, 1, 4)
		-- 63447168 == ((0xF0 * 64 + 0x80) * 64 + 0x80) * 64 + 0x80
		return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
	end

-- Matches a single UTF-8 encoded character whose code point belongs to the
-- `ucschar` set of RFC 3987 section 2.2.
-- The match-time function returns only a boolean. Any extra value returned
-- from an lpeg.Cmt function becomes a capture value, and inside the Cs(...)
-- substitution captures that use this pattern such a value would replace the
-- matched character in the output string.
local ucschar = Cmt(utf8, function(_, _, codepoint)
	if codepoint < 0x10000 then
		-- Basic Multilingual Plane: %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
		return (codepoint >= 0xA0 and codepoint <= 0xD7FF)
			or (codepoint >= 0xF900 and codepoint <= 0xFDCF)
			or (codepoint >= 0xFDF0 and codepoint <= 0xFFEF)
	end
	local plane = math.floor(codepoint / 0x10000)
	local offset = codepoint % 0x10000
	if plane <= 13 then
		-- planes 1-13: %xN0000-NFFFD (the last two code points are excluded)
		return offset <= 0xFFFD
	elseif plane == 14 then
		-- plane 14: %xE1000-EFFFD
		return offset >= 0x1000 and offset <= 0xFFFD
	end
	return false
end)

-- iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
local iunreserved = core.ALPHA + core.DIGIT + S"-._~" + ucschar

-- User information part of an authority; captured as one substituted string.
local iuserinfo = Cs((iunreserved + uri.pct_encoded + uri.sub_delims + P":")^0)

-- Lower-cases the replacement text of a percent-encoded octet unless it is
-- still in "%XX" form, whose hex digits must keep their case.
-- NOTE(review): assumes uri.pct_encoded yields either the original "%XX" text
-- or a single decoded character - confirm against lpeg_patterns.uri.
local function lower_unless_escaped(s)
	if s:sub(1, 1) == "%" then
		return s
	end
	return s:lower()
end

-- Registered name. Letters matched by core.ALPHA are normalised to lower case,
-- which is what the accompanying spec expects ("//LOCALHOST" and
-- "//l%4FcAlH%6fSt" must both give host "localhost").
-- TODO: Normalisation of non-ASCII characters (ucschar) is still not done.
-- An empty registered name captures nil.
local ireg_name = Cs((
	core.ALPHA / string.lower
	+ core.DIGIT
	+ S"-._~"
	+ ucschar
	+ uri.pct_encoded / lower_unless_escaped
	+ uri.sub_delims
)^1) + Cc(nil)
-- IP literals and IPv4 addresses are turned into strings with tostring.
local ihost = (uri.IP_literal + IPv4.IPv4address) / tostring + ireg_name

-- ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
local ipchar = iunreserved + uri.pct_encoded + uri.sub_delims + P":" + P"@"
local isegment = ipchar^0 -- possibly empty segment
local isegment_nz = ipchar^1 -- non-empty segment
local isegment_nz_nc = (ipchar - P":")^1 -- non-empty segment without any colon

local slash = P"/"
local more_segments = (slash * isegment)^0 -- *( "/" isegment )

-- An empty path is captured as nil rather than as the empty string.
local ipath_empty = Cc(nil)
-- One or more "/segment" parts, or nothing at all.
local ipath_abempty = Cs((slash * isegment)^1) + ipath_empty
-- Starts with a non-empty segment.
local ipath_rootless = Cs(isegment_nz * more_segments)
-- Like rootless, but the first segment may not contain ":".
local ipath_noscheme = Cs(isegment_nz_nc * more_segments)
-- Starts with "/", optionally followed by a rootless path.
local ipath_absolute = Cs(slash * (isegment_nz * more_segments)^-1)

-- Matches a single UTF-8 encoded character from the private-use ranges
-- (`iprivate` in RFC 3987 section 2.2):
-- %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
-- Only a boolean is returned: any extra value returned from an lpeg.Cmt
-- function becomes a capture value and would replace the matched character
-- inside the Cs(...) capture that builds the query string.
local iprivate = Cmt(utf8, function(_, _, codepoint)
	return (codepoint >= 0xE000 and codepoint <= 0xF8FF)
		or (codepoint >= 0xF0000 and codepoint <= 0xFFFFD)
		or (codepoint >= 0x100000 and codepoint <= 0x10FFFD)
end)

-- Query: path characters, private-use characters, "/" and "?".
local iquery = Cs((ipchar + iprivate + S"/?")^0)

-- Fragment: path characters, "/" and "?".
local ifragment = Cs((ipchar + S"/?")^0)

-- [ userinfo "@" ] host [ ":" port ], stored in the named groups
-- "userinfo", "host" and "port".
local opt_userinfo = (Cg(iuserinfo, "userinfo") * P"@")^-1
local opt_port = (P":" * Cg(uri.port, "port"))^-1
local iauthority = opt_userinfo * Cg(ihost, "host") * opt_port

-- "//" authority path-abempty: shared by the hierarchical and relative parts.
local with_authority = P"//" * iauthority * Cg(ipath_abempty, "path")

local ihier_part = with_authority
	+ Cg(ipath_absolute + ipath_rootless + ipath_empty, "path")

-- Pieces shared by the exported patterns below.
local scheme_prefix = Cg(uri.scheme, "scheme") * P":"
local opt_query = (P"?" * Cg(iquery, "query"))^-1
local opt_fragment = (P"#" * Cg(ifragment, "fragment"))^-1

-- scheme ":" hier-part [ "?" query ]; a fragment is not part of the pattern.
_M.absolute_IRI = Ct(scheme_prefix * ihier_part * opt_query)

-- scheme ":" hier-part [ "?" query ] [ "#" fragment ]
_M.IRI = Ct(scheme_prefix * ihier_part * opt_query * opt_fragment)

-- Without a scheme the first path segment may not contain ":".
local irelative_part = with_authority
	+ Cg(ipath_absolute + ipath_noscheme + ipath_empty, "path")
local irelative_ref = Ct(irelative_part * opt_query * opt_fragment)

-- A full IRI is tried first, then a relative reference.
_M.IRI_reference = _M.IRI + irelative_ref

-- Matches the path portion of an IRI; captures nil for the empty path.
-- LPeg choice is ordered and ipath_abempty always succeeds (its last
-- alternative is ipath_empty), so putting it first would stop rootless paths
-- such as "a/b" from ever being matched. The alternatives are therefore
-- written so that the empty path is tried last:
--   * one or more "/segment" parts (non-empty ipath-abempty; this also covers
--     everything ipath-absolute matches),
--   * ipath_rootless (which also covers everything ipath-noscheme matches),
--   * ipath_empty.
_M.ipath = Cs((P"/" * isegment)^1) + ipath_rootless + ipath_empty

return _M
150 changes: 150 additions & 0 deletions spec/iri_spec.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
local lpeg = require "lpeg"
describe("IRI", function()
local iri_lib = require "lpeg_patterns.iri"
local absolute_IRI = iri_lib.absolute_IRI * lpeg.P(-1)
local IRI = iri_lib.IRI * lpeg.P(-1)
local iref = iri_lib.IRI_reference * lpeg.P(-1)
local ipath = iri_lib.ipath * lpeg.P(-1)
it("Should break down full IRIs correctly", function()
	-- Each case drops one more component from the fully populated IRI.
	local cases = {
		{
			input = "scheme://userinfo@host:1234/path?query#fragment",
			expected = {scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"},
		},
		{
			input = "scheme://userinfo@host:1234/path?query",
			expected = {scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query"},
		},
		{
			input = "scheme://userinfo@host:1234/path",
			expected = {scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path"},
		},
		{
			input = "scheme://host:1234/path",
			expected = {scheme="scheme", host="host", port=1234, path="/path"},
		},
		{
			input = "scheme://host/path",
			expected = {scheme="scheme", host="host", path="/path"},
		},
		{
			input = "scheme:///path",
			expected = {scheme="scheme", path="/path"},
		},
		{
			input = "scheme://",
			expected = {scheme="scheme"},
		},
	}
	for _, case in ipairs(cases) do
		assert.same(case.expected, IRI:match(case.input))
	end
end)
it("Normalises to lower case scheme", function()
	-- Mixed-case and upper-case schemes must both come back as "scheme".
	for _, input in ipairs {"Scheme://", "SCHEME://"} do
		assert.same({scheme="scheme"}, IRI:match(input))
	end
end)
it("shouldn't allow fragments when using absolute_IRI", function()
	local without_fragment = "scheme://userinfo@host:1234/path?query"
	-- The same IRI with a fragment appended must be rejected as a whole.
	assert.falsy(absolute_IRI:match(without_fragment .. "#fragment"))
	assert.same(
		{scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query"},
		absolute_IRI:match(without_fragment)
	)
end)
it("Should break down relative IRIs correctly", function()
	-- From a full IRI down to the empty reference, query-only and fragment-only.
	local cases = {
		{
			input = "scheme://userinfo@host:1234/path?query#fragment",
			expected = {scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"},
		},
		{
			input = "//userinfo@host:1234/path?query#fragment",
			expected = {userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"},
		},
		{
			input = "//host:1234/path?query#fragment",
			expected = {host="host", port=1234, path="/path", query="query", fragment="fragment"},
		},
		{
			input = "//host/path?query#fragment",
			expected = {host="host", path="/path", query="query", fragment="fragment"},
		},
		{
			input = "///path?query#fragment",
			expected = {path="/path", query="query", fragment="fragment"},
		},
		{
			input = "/path?query#fragment",
			expected = {path="/path", query="query", fragment="fragment"},
		},
		{
			input = "/path#fragment",
			expected = {path="/path", fragment="fragment"},
		},
		{
			input = "/path",
			expected = {path="/path"},
		},
		{
			input = "",
			expected = {},
		},
		{
			input = "?query",
			expected = {query="query"},
		},
		{
			input = "#fragment",
			expected = {fragment="fragment"},
		},
	}
	for _, case in ipairs(cases) do
		assert.same(case.expected, iref:match(case.input))
	end
end)
it("Should match file urls", function()
	-- A file IRI with an empty authority: everything after "file://" is the path.
	local function check(path)
		assert.same({scheme="file", path=path}, IRI:match("file://" .. path))
	end
	check "/var/log/messages"
	check "/C:/Windows/"
end)
it("Should decode unreserved percent characters path", function()
	-- %5F ("_") is expected to be decoded, %00 is expected to stay encoded.
	local cases = {
		{"/underscore%5Fcharacter", "/underscore_character"},
		{"/null%00byte", "/null%00byte"},
	}
	for _, case in ipairs(cases) do
		assert.same(case[2], ipath:match(case[1]))
	end
end)
it("Should fail on incorrect percent characters", function()
	-- "%" followed by anything other than two hex digits must not match.
	for _, input in ipairs {"/bad%x0percent", "/%s"} do
		assert.falsy(ipath:match(input))
	end
end)
it("Should not introduce ambiguiuty by decoding percent encoded entities", function()
	-- The encoded ampersand (%26) must stay distinguishable from the literal "&".
	local parsed = iref:match "?query%26with&ampersand"
	assert.same({query="query%26with&ampersand"}, parsed)
end)
it("Should decode unreserved percent characters in query and fragment", function()
	-- %5F ("_") is expected to be decoded while %20 stays encoded.
	local query_ref = iref:match "?query%20with%5Fescapes"
	assert.same({query="query%20with_escapes"}, query_ref)
	local fragment_ref = iref:match "#fragment%20with%5Fescapes"
	assert.same({fragment="fragment%20with_escapes"}, fragment_ref)
end)
it("Should match localhost", function()
	-- Every spelling of the host is expected to come back as "localhost".
	for _, input in ipairs {"//localhost", "//LOCALHOST", "//l%4FcAlH%6fSt"} do
		assert.same({host="localhost"}, iref:match(input))
	end
	assert.same({host="localhost", port=8000}, iref:match "//localhost:8000")
	assert.same({scheme="http", host="localhost", port=8000}, IRI:match "http://localhost:8000")
end)
it("Should work with IPv6", function()
	-- "::1" is expected in its fully expanded form.
	local loopback = "0:0:0:0:0:0:0:1"
	assert.same({host=loopback}, iref:match "//[::1]")
	assert.same({host=loopback, port=80}, iref:match "//[::1]:80")
end)
it("IPvFuture", function()
	-- Lower-case and upper-case version markers must give the same result.
	local expected = {host="v4.2", port=80}
	for _, input in ipairs {"//[v4.2]:80", "//[V4.2]:80"} do
		assert.same(expected, iref:match(input))
	end
end)
it("Should work with IPv6 zone local addresses", function()
	-- The zone separator is written "%25" in the input and "%" in the host.
	local parsed = iref:match "//[::1%25eth0]"
	assert.same({host="0:0:0:0:0:0:0:1%eth0"}, parsed)
end)
it("Relative IRI does not match authority when scheme is missing", function()
	-- Without "//" the host-looking text should end up in the path.
	local relative = iref:match "example.com/"
	assert.same({path="example.com/"}, relative)
	local absolute = iref:match "scheme://example.com/"
	assert.same({scheme="scheme", host="example.com", path="/"}, absolute)
end)
it("Should work with mailto URIs", function()
	-- Builds "mailto:" path [ "?" query ] and checks that it parses back into
	-- exactly those components.
	local function check(path, query)
		local input = "mailto:" .. path
		if query then
			input = input .. "?" .. query
		end
		assert.same({scheme="mailto", path=path, query=query}, IRI:match(input))
	end

	check("user@example.com")
	check("someone@example.com,someoneelse@example.com")
	check("user@example.com", "subject=This%20is%20the%20subject&cc=someone_else@example.com&body=This%20is%20the%20body")

	-- Examples from RFC-6068
	-- Section 6.1
	check("chris@example.com")
	check("infobot@example.com", "subject=current-issue")
	check("infobot@example.com", "body=send%20current-issue")
	check("infobot@example.com", "body=send%20current-issue%0D%0Asend%20index")
	check("list@example.org", "In-Reply-To=%3C3469A91.D10AF4C@example.com%3E")
	check("majordomo@example.com", "body=subscribe%20bamboo-l")
	check("joe@example.com", "cc=bob@example.com&body=hello")
	check("gorby%25kremvax@example.com")
	check("unlikely%3Faddress@example.com", "blat=foop")
	check("Mike%26family@example.org")
	-- Section 6.2
	check([[%22not%40me%22@example.org]])
	check([[%22oh%5C%5Cno%22@example.org]])
	check([[%22%5C%5C%5C%22it's%5C%20ugly%5C%5C%5C%22%22@example.org]])
end)
it("Should work with xmpp URIs", function()
	-- Builds "xmpp:" path [ "?" query ] (no authority) and checks that it
	-- parses back into exactly those components.
	local function check(path, query)
		local input = "xmpp:" .. path
		if query then
			input = input .. "?" .. query
		end
		assert.same({scheme="xmpp", path=path, query=query}, IRI:match(input))
	end

	-- Examples from RFC-5122
	check("node@example.com")
	-- The two forms with an authority component are spelled out in full.
	assert.same({scheme="xmpp", userinfo="guest", host="example.com"}, IRI:match "xmpp://guest@example.com")
	assert.same(
		{scheme="xmpp", userinfo="guest", host="example.com", path="/support@example.com", query="message"},
		IRI:match "xmpp://guest@example.com/support@example.com?message"
	)
	check("support@example.com", "message")

	check("example-node@example.com")
	check("example-node@example.com/some-resource")
	check("example.com")
	check("example-node@example.com", "message")
	check("example-node@example.com", "message;subject=Hello%20World")
	check([[nasty!%23$%25()*+,-.;=%3F%5B%5C%5D%5E_%60%7B%7C%7D~node@example.com]])
	check([[node@example.com/repulsive%20!%23%22$%25&'()*+,-.%2F:;%3C=%3E%3F%40%5B%5C%5D%5E_%60%7B%7C%7D~resource]])
	check("ji%C5%99i@%C4%8Dechy.example/v%20Praze")
end)
end)