diff --git a/README.md b/README.md
index 61b2055..d5e4399 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,25 @@ Parses URIs as described in [RFC-3986](https://tools.ietf.org/html/rfc3986).
   - `sub_delims` (pattern): the set of subcomponent delimeters
 
 
+### `iri`
+
+Parses IRIs as described in [RFC-3987](https://tools.ietf.org/html/rfc3987).
+
+Very similar to the [uri](#uri) module, except allows utf8.
+
+  - `IRI` (pattern): on success, returns a table with fields: (similar to [luasocket](http://w3.impa.br/~diego/software/luasocket/url.html#parse))
+    - `scheme`
+    - `userinfo`
+    - `host`
+    - `port`
+    - `path`
+    - `query`
+    - `fragment`
+  - `absolute_IRI` (pattern): similar to `IRI`, but does not permit fragments
+  - `IRI_reference` (pattern): similar to `IRI`, but permits relative URIs
+  - `ipath` (pattern): matches the path portion of an IRI. Captures `nil` for the empty path.
+
+
 ### `email`
 
   - `mailbox` (pattern): the mailbox format: matches either `name_addr` or an addr-spec.
diff --git a/lpeg_patterns/iri.lua b/lpeg_patterns/iri.lua
new file mode 100644
index 0000000..2a2c961
--- /dev/null
+++ b/lpeg_patterns/iri.lua
@@ -0,0 +1,160 @@
+-- https://tools.ietf.org/html/rfc3987
+--
+-- LPeg patterns for Internationalized Resource Identifiers (IRIs),
+-- following the grammar in RFC 3987 section 2.2. The ASCII-only pieces
+-- (scheme, port, pct-encoded, sub-delims, IP literals) are reused from
+-- the uri and IPv4 modules.
+
+local lpeg = require "lpeg"
+local core = require "lpeg_patterns.core"
+local uri = require "lpeg_patterns.uri"
+local IPv4 = require "lpeg_patterns.IPv4"
+
+local Cc = lpeg.Cc
+local Cg = lpeg.Cg
+local Cs = lpeg.Cs
+local Ct = lpeg.Ct
+local Cmt = lpeg.Cmt
+local P = lpeg.P
+local R = lpeg.R
+local S = lpeg.S
+
+local _M = {}
+
+-- Matches one UTF-8 encoded character and captures its code point as a number.
+-- The constant subtracted in each branch removes the marker bits of the lead
+-- and continuation bytes (e.g. 12416 == 0xC0 * 64 + 0x80).
+-- Overlong encodings are not rejected.
+local cont = R"\128\191" -- continuation byte
+local utf8 = R"\0\127" / string.byte
+	+ R"\194\223" * cont / function(s)
+		local c1, c2 = string.byte(s, 1, 2)
+		return c1 * 64 + c2 - 12416
+	end
+	+ R"\224\239" * cont * cont / function(s)
+		local c1, c2, c3 = string.byte(s, 1, 3)
+		return (c1 * 64 + c2) * 64 + c3 - 925824
+	end
+	+ R"\240\244" * cont * cont * cont / function(s)
+		local c1, c2, c3, c4 = string.byte(s, 1, 4)
+		return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
+	end
+
+-- Builds a pattern matching a single UTF-8 character whose code point lies in
+-- one of the given inclusive ranges (a flat list: lo1, hi1, lo2, hi2, ...).
+-- The match-time function returns only a boolean: any extra return value
+-- would become a capture value and be substituted for the matched bytes by
+-- an enclosing Cs().
+local function codepoint_in(ranges)
+	return Cmt(utf8, function(_, _, codepoint)
+		for n = 1, #ranges, 2 do
+			if codepoint >= ranges[n] and codepoint <= ranges[n+1] then
+				return true
+			end
+		end
+		return false
+	end)
+end
+
+local ucschar = codepoint_in {
+	0xA0, 0xD7FF,
+	0xF900, 0xFDCF,
+	0xFDF0, 0xFFEF,
+	0x10000, 0x1FFFD,
+	0x20000, 0x2FFFD,
+	0x30000, 0x3FFFD,
+	0x40000, 0x4FFFD,
+	0x50000, 0x5FFFD,
+	0x60000, 0x6FFFD,
+	0x70000, 0x7FFFD,
+	0x80000, 0x8FFFD,
+	0x90000, 0x9FFFD,
+	0xA0000, 0xAFFFD,
+	0xB0000, 0xBFFFD,
+	0xC0000, 0xCFFFD,
+	0xD0000, 0xDFFFD,
+	0xE1000, 0xEFFFD,
+}
+
+local iunreserved = core.ALPHA + core.DIGIT + S"-._~" + ucschar
+
+local iuserinfo = Cs((iunreserved + uri.pct_encoded + uri.sub_delims + P":")^0)
+
+-- Lower-cases a decoded character but leaves a still-escaped "%XX" untouched.
+local function lower_unless_escaped(s)
+	if s:sub(1, 1) == "%" then
+		return s
+	end
+	return s:lower()
+end
+
+-- Host names are case-insensitive: ASCII letters are folded to lower case.
+-- TODO: Normalisation of non-ASCII characters
+local ireg_name = Cs((
+	core.ALPHA / string.lower
+	+ core.DIGIT
+	+ S"-._~"
+	+ ucschar
+	+ uri.pct_encoded / lower_unless_escaped
+	+ uri.sub_delims
+)^1) + Cc(nil)
+local ihost = (uri.IP_literal + IPv4.IPv4address) / tostring + ireg_name
+
+local ipchar = iunreserved + uri.pct_encoded + uri.sub_delims + S":@"
+local isegment = ipchar^0
+local isegment_nz = ipchar^1
+local isegment_nz_nc = (ipchar - P":")^1
+
+local ipath_empty = Cc(nil) -- an empty path is nil instead of the empty string
+local ipath_slashed = Cs((P"/" * isegment)^1) -- the non-empty half of ipath-abempty
+local ipath_abempty = ipath_slashed + ipath_empty
+local ipath_rootless = Cs(isegment_nz * (P"/" * isegment)^0)
+local ipath_noscheme = Cs(isegment_nz_nc * (P"/" * isegment)^0)
+local ipath_absolute = Cs(P"/" * (isegment_nz * (P"/" * isegment)^0)^-1)
+
+local iprivate = codepoint_in {
+	0xE000, 0xF8FF,
+	0xF0000, 0xFFFFD,
+	0x100000, 0x10FFFD,
+}
+
+local iquery = Cs((ipchar + iprivate + S"/?")^0)
+
+local ifragment = Cs((ipchar + S"/?")^0)
+
+local iauthority = (Cg(iuserinfo, "userinfo") * P"@")^-1
+	* Cg(ihost, "host")
+	* (P":" * Cg(uri.port, "port"))^-1
+
+local ihier_part = P"//" * iauthority * Cg(ipath_abempty, "path")
+	+ Cg(ipath_absolute + ipath_rootless + ipath_empty, "path")
+
+_M.absolute_IRI = Ct(
+	(Cg(uri.scheme, "scheme") * P":")
+	* ihier_part
+	* (P"?" * Cg(iquery, "query"))^-1
+)
+
+_M.IRI = Ct(
+	(Cg(uri.scheme, "scheme") * P":")
+	* ihier_part
+	* (P"?" * Cg(iquery, "query"))^-1
+	* (P"#" * Cg(ifragment, "fragment"))^-1
+)
+
+local irelative_part = P"//" * iauthority * Cg(ipath_abempty, "path")
+	+ Cg(ipath_absolute + ipath_noscheme + ipath_empty, "path")
+local irelative_ref = Ct(
+	irelative_part
+	* (P"?" * Cg(iquery, "query"))^-1
+	* (P"#" * Cg(ifragment, "fragment"))^-1
+)
+_M.IRI_reference = _M.IRI + irelative_ref
+
+-- ipath_abempty and ipath_empty always succeed (possibly on the empty string),
+-- so in an ordered choice they must come after every alternative that
+-- consumes input. ipath_slashed covers ipath-abempty and ipath-absolute;
+-- ipath_rootless covers ipath-noscheme.
+_M.ipath = ipath_slashed + ipath_rootless + ipath_empty
+
+return _M
diff --git a/spec/iri_spec.lua b/spec/iri_spec.lua
new file mode 100644
index 0000000..e878cde
--- /dev/null
+++ b/spec/iri_spec.lua
@@ -0,0 +1,166 @@
+local lpeg = require "lpeg"
+describe("IRI", function()
+	local iri_lib = require "lpeg_patterns.iri"
+	local absolute_IRI = iri_lib.absolute_IRI * lpeg.P(-1)
+	local IRI = iri_lib.IRI * lpeg.P(-1)
+	local iref = iri_lib.IRI_reference * lpeg.P(-1)
+	local ipath = iri_lib.ipath * lpeg.P(-1)
+	it("Should break down full IRIs correctly", function()
+		assert.same({scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"},
+			IRI:match "scheme://userinfo@host:1234/path?query#fragment")
+		assert.same({scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query"},
+			IRI:match "scheme://userinfo@host:1234/path?query")
+		assert.same({scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path"},
+			IRI:match "scheme://userinfo@host:1234/path")
+		assert.same({scheme="scheme", host="host", port=1234, path="/path"},
+			IRI:match "scheme://host:1234/path")
+		assert.same({scheme="scheme", host="host", path="/path"},
+			IRI:match "scheme://host/path")
+		assert.same({scheme="scheme", path="/path"},
+			IRI:match "scheme:///path")
+		assert.same({scheme="scheme"},
+			IRI:match "scheme://")
+	end)
+	it("Normalises to lower case scheme", function()
+		assert.same({scheme="scheme"}, IRI:match "Scheme://")
+		assert.same({scheme="scheme"}, IRI:match "SCHEME://")
+	end)
+	it("shouldn't allow fragments when using absolute_IRI", function()
+		assert.falsy(absolute_IRI:match "scheme://userinfo@host:1234/path?query#fragment")
+		assert.same({scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query"},
+			absolute_IRI:match "scheme://userinfo@host:1234/path?query")
+	end)
+	it("Should break down relative IRIs correctly", function()
+		assert.same({scheme="scheme", userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"},
+			iref:match "scheme://userinfo@host:1234/path?query#fragment")
+		assert.same({userinfo="userinfo", host="host", port=1234, path="/path", query="query", fragment="fragment"},
+			iref:match "//userinfo@host:1234/path?query#fragment")
+		assert.same({host="host", port=1234, path="/path", query="query", fragment="fragment"},
+			iref:match "//host:1234/path?query#fragment")
+		assert.same({host="host", path="/path", query="query", fragment="fragment"},
+			iref:match "//host/path?query#fragment")
+		assert.same({path="/path", query="query", fragment="fragment"},
+			iref:match "///path?query#fragment")
+		assert.same({path="/path", query="query", fragment="fragment"},
+			iref:match "/path?query#fragment")
+		assert.same({path="/path", fragment="fragment"},
+			iref:match "/path#fragment")
+		assert.same({path="/path"},
+			iref:match "/path")
+		assert.same({},
+			iref:match "")
+		assert.same({query="query"},
+			iref:match "?query")
+		assert.same({fragment="fragment"},
+			iref:match "#fragment")
+	end)
+	it("Should match file urls", function()
+		assert.same({scheme="file", path="/var/log/messages"}, IRI:match "file:///var/log/messages")
+		assert.same({scheme="file", path="/C:/Windows/"}, IRI:match "file:///C:/Windows/")
+	end)
+	it("Should decode unreserved percent characters path", function()
+		assert.same("/underscore_character", ipath:match "/underscore%5Fcharacter")
+		assert.same("/null%00byte", ipath:match "/null%00byte")
+	end)
+	it("Should fail on incorrect percent characters", function()
+		assert.falsy(ipath:match "/bad%x0percent")
+		assert.falsy(ipath:match "/%s")
+	end)
+	it("Should not introduce ambiguiuty by decoding percent encoded entities", function()
+		assert.same({query="query%26with&ersand"}, iref:match "?query%26with&ersand")
+	end)
+	it("Should decode unreserved percent characters in query and fragment", function()
+		assert.same({query="query%20with_escapes"}, iref:match "?query%20with%5Fescapes")
+		assert.same({fragment="fragment%20with_escapes"}, iref:match "#fragment%20with%5Fescapes")
+	end)
+	it("Should match localhost", function()
+		assert.same({host="localhost"}, iref:match "//localhost")
+		assert.same({host="localhost"}, iref:match "//LOCALHOST")
+		assert.same({host="localhost"}, iref:match "//l%4FcAlH%6fSt")
+		assert.same({host="localhost", port=8000}, iref:match "//localhost:8000")
+		assert.same({scheme="http", host="localhost", port=8000}, IRI:match "http://localhost:8000")
+	end)
+	it("Should work with IPv6", function()
+		assert.same({host="0:0:0:0:0:0:0:1"}, iref:match "//[::1]")
+		assert.same({host="0:0:0:0:0:0:0:1", port=80}, iref:match "//[::1]:80")
+	end)
+	it("IPvFuture", function()
+		assert.same({host="v4.2", port=80}, iref:match "//[v4.2]:80")
+		assert.same({host="v4.2", port=80}, iref:match "//[V4.2]:80")
+	end)
+	it("Should work with IPv6 zone local addresses", function()
+		assert.same({host="0:0:0:0:0:0:0:1%eth0"}, iref:match "//[::1%25eth0]")
+	end)
+	it("Relative IRI does not match authority when scheme is missing", function()
+		assert.same({path="example.com/"}, iref:match "example.com/") -- should end up in path
+		assert.same({scheme="scheme", host="example.com", path="/"}, iref:match "scheme://example.com/")
+	end)
+	it("Should work with mailto URIs", function()
+		assert.same({scheme="mailto", path="user@example.com"}, IRI:match "mailto:user@example.com")
+		assert.same({scheme="mailto", path="someone@example.com,someoneelse@example.com"},
+			IRI:match "mailto:someone@example.com,someoneelse@example.com")
+		assert.same({scheme="mailto", path="user@example.com", query="subject=This%20is%20the%20subject&cc=someone_else@example.com&body=This%20is%20the%20body"},
+			IRI:match "mailto:user@example.com?subject=This%20is%20the%20subject&cc=someone_else@example.com&body=This%20is%20the%20body")
+
+		-- Examples from RFC-6068
+		-- Section 6.1
+		assert.same({scheme="mailto", path="chris@example.com"}, IRI:match "mailto:chris@example.com")
+		assert.same({scheme="mailto", path="infobot@example.com", query="subject=current-issue"},
+			IRI:match "mailto:infobot@example.com?subject=current-issue")
+		assert.same({scheme="mailto", path="infobot@example.com", query="body=send%20current-issue"},
+			IRI:match "mailto:infobot@example.com?body=send%20current-issue")
+		assert.same({scheme="mailto", path="infobot@example.com", query="body=send%20current-issue%0D%0Asend%20index"},
+			IRI:match "mailto:infobot@example.com?body=send%20current-issue%0D%0Asend%20index")
+		assert.same({scheme="mailto", path="list@example.org", query="In-Reply-To=%3C3469A91.D10AF4C@example.com%3E"},
+			IRI:match "mailto:list@example.org?In-Reply-To=%3C3469A91.D10AF4C@example.com%3E")
+		assert.same({scheme="mailto", path="majordomo@example.com", query="body=subscribe%20bamboo-l"},
+			IRI:match "mailto:majordomo@example.com?body=subscribe%20bamboo-l")
+		assert.same({scheme="mailto", path="joe@example.com", query="cc=bob@example.com&body=hello"},
+			IRI:match "mailto:joe@example.com?cc=bob@example.com&body=hello")
+		assert.same({scheme="mailto", path="gorby%25kremvax@example.com"}, IRI:match "mailto:gorby%25kremvax@example.com")
+		assert.same({scheme="mailto", path="unlikely%3Faddress@example.com", query="blat=foop"},
+			IRI:match "mailto:unlikely%3Faddress@example.com?blat=foop")
+		assert.same({scheme="mailto", path="Mike%26family@example.org"}, IRI:match "mailto:Mike%26family@example.org")
+		-- Section 6.2
+		assert.same({scheme="mailto", path=[[%22not%40me%22@example.org]]}, IRI:match "mailto:%22not%40me%22@example.org")
+		assert.same({scheme="mailto", path=[[%22oh%5C%5Cno%22@example.org]]}, IRI:match "mailto:%22oh%5C%5Cno%22@example.org")
+		assert.same({scheme="mailto", path=[[%22%5C%5C%5C%22it's%5C%20ugly%5C%5C%5C%22%22@example.org]]},
+			IRI:match "mailto:%22%5C%5C%5C%22it's%5C%20ugly%5C%5C%5C%22%22@example.org")
+	end)
+	it("Should work with xmpp URIs", function()
+		-- Examples from RFC-5122
+		assert.same({scheme="xmpp", path="node@example.com"}, IRI:match "xmpp:node@example.com")
+		assert.same({scheme="xmpp", userinfo="guest", host="example.com"}, IRI:match "xmpp://guest@example.com")
+		assert.same({scheme="xmpp", userinfo="guest", host="example.com", path="/support@example.com", query="message"},
+			IRI:match "xmpp://guest@example.com/support@example.com?message")
+		assert.same({scheme="xmpp", path="support@example.com", query="message"}, IRI:match "xmpp:support@example.com?message")
+
+		assert.same({scheme="xmpp", path="example-node@example.com"}, IRI:match "xmpp:example-node@example.com")
+		assert.same({scheme="xmpp", path="example-node@example.com/some-resource"}, IRI:match "xmpp:example-node@example.com/some-resource")
+		assert.same({scheme="xmpp", path="example.com"}, IRI:match "xmpp:example.com")
+		assert.same({scheme="xmpp", path="example-node@example.com", query="message"}, IRI:match "xmpp:example-node@example.com?message")
+		assert.same({scheme="xmpp", path="example-node@example.com", query="message;subject=Hello%20World"},
+			IRI:match "xmpp:example-node@example.com?message;subject=Hello%20World")
+		assert.same({scheme="xmpp", path=[[nasty!%23$%25()*+,-.;=%3F%5B%5C%5D%5E_%60%7B%7C%7D~node@example.com]]},
+			IRI:match "xmpp:nasty!%23$%25()*+,-.;=%3F%5B%5C%5D%5E_%60%7B%7C%7D~node@example.com")
+		assert.same({scheme="xmpp", path=[[node@example.com/repulsive%20!%23%22$%25&'()*+,-.%2F:;%3C=%3E%3F%40%5B%5C%5D%5E_%60%7B%7C%7D~resource]]},
+			IRI:match [[xmpp:node@example.com/repulsive%20!%23%22$%25&'()*+,-.%2F:;%3C=%3E%3F%40%5B%5C%5D%5E_%60%7B%7C%7D~resource]])
+		assert.same({scheme="xmpp", path="ji%C5%99i@%C4%8Dechy.example/v%20Praze"}, IRI:match "xmpp:ji%C5%99i@%C4%8Dechy.example/v%20Praze")
+	end)
+	it("Should keep non-ASCII characters intact", function()
+		-- U+00E9 (e with acute accent) is \195\169 in UTF-8
+		assert.same({scheme="http", host="r\195\169sum\195\169.example", path="/caf\195\169"},
+			IRI:match "http://r\195\169sum\195\169.example/caf\195\169")
+		assert.same({query="q=\195\169", fragment="\195\169"}, iref:match "?q=\195\169#\195\169")
+	end)
+	it("Should only allow private use characters in the query", function()
+		-- U+E000 is \238\128\128 in UTF-8
+		assert.same({query="\238\128\128"}, iref:match "?\238\128\128")
+		assert.falsy(iref:match "#\238\128\128")
+		assert.falsy(iref:match "/\238\128\128")
+	end)
+	it("Should match rootless paths with ipath", function()
+		assert.same("rootless/path", ipath:match "rootless/path")
+		assert.same("with:colon/path", ipath:match "with:colon/path")
+	end)
+end)