Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ UNRELEASED

- http: split into multiple modules
- http.alpn: verify that protocol_id meets unique encoding criteria
- utf8: Add new utf8 module
- email: Allow internationalised email addresses (#15)


0.5 - 2018-07-15
Expand Down
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ Parses URIs as described in [RFC-3986](https://tools.ietf.org/html/rfc3986).
- `mailbox` (pattern): the mailbox format: matches either `name_addr` or an addr-spec.
- `name_addr` (pattern): the name and address format i.e. `Display Name<email@example.com>`
Has captures of the local_part and the domain. Captures the display name in the named capture `"display"`
- `email` (pattern): also known as an "addr-spec"; follows [RFC-5322 section 3.4.1](http://tools.ietf.org/html/rfc5322#section-3.4.1)
- `email` (pattern): also known as an "addr-spec"; follows [RFC-5322 section 3.4.1](http://tools.ietf.org/html/rfc5322#section-3.4.1) with the internationalisation extensions from [RFC-6532 section 3.1](http://tools.ietf.org/html/rfc6532#section-3.1)
Has captures of the local_part and the domain
Be careful trying to reconstruct the email address from the captures; you may need escaping
- `local_part` (pattern): the bit before the `@` in an email address
Expand Down Expand Up @@ -340,3 +340,15 @@ Patterns for definitions from [RFC-4646 Section 2.1](https://tools.ietf.org/html
- `privateuse` (optional): an array
- `privateuse` (pattern): captures an array
- `Language_Tag` (pattern): captures the whole language tag


### `utf8`

Patterns for matching UTF-8 byte sequences, following the grammar in [RFC 3629 Section 4](https://tools.ietf.org/html/rfc3629#section-4)

- `UTF8_1` (pattern): matches a 1-byte UTF-8 sequence (i.e. an ASCII character)
- `UTF8_2` (pattern): matches a 2-byte UTF-8 sequence
- `UTF8_3` (pattern): matches a 3-byte UTF-8 sequence
- `UTF8_4` (pattern): matches a 4-byte UTF-8 sequence
- `UTF8_char` (pattern): matches a single UTF-8 encoded character (any one of the sequences above)
- `UTF8_octets` (pattern): matches zero or more UTF-8 encoded characters in a row
1 change: 1 addition & 0 deletions lpeg_patterns-scm-0.rockspec
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,6 @@ build = {
["lpeg_patterns.http.websocket"] = "lpeg_patterns/http/websocket.lua";
["lpeg_patterns.phone"] = "lpeg_patterns/phone.lua";
["lpeg_patterns.language"] = "lpeg_patterns/language.lua";
["lpeg_patterns.utf8"] = "lpeg_patterns/utf8.lua";
};
}
14 changes: 9 additions & 5 deletions lpeg_patterns/email.lua
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,17 @@ local Cg = lpeg.Cg
local Ct = lpeg.Ct
local Cs = lpeg.Cs

-- From RFC 6532 Section 3.1
local utf8 = require "lpeg_patterns.utf8"
local UTF8_non_ascii = utf8.UTF8_2 + utf8.UTF8_3 + utf8.UTF8_4

local core = require "lpeg_patterns.core"
local CHAR = core.CHAR
local CRLF = core.CRLF
local CTL = core.CTL
local DQUOTE = core.DQUOTE
local WSP = core.WSP
local VCHAR = core.VCHAR
local VCHAR = core.VCHAR + UTF8_non_ascii

local obs_NO_WS_CTL = R("\1\8", "\11\12", "\14\31") + P"\127"

Expand All @@ -28,7 +32,7 @@ local quoted_pair = Cg(P"\\" * C(VCHAR + WSP)) + obs_qp
local FWS = (WSP^0 * CRLF)^-1 * WSP^1 / " " -- Fold whitespace into a single " "

-- Comments
local ctext = R"\33\39" + R"\42\91" + R"\93\126"
local ctext = R"\33\39" + R"\42\91" + R"\93\126" + UTF8_non_ascii
local comment = P {
V"comment" ;
ccontent = ctext + quoted_pair + V"comment" ;
Expand All @@ -38,13 +42,13 @@ local CFWS = ((FWS^-1 * comment)^1 * FWS^-1 + FWS ) / function() end

-- Atom
local specials = S[=[()<>@,;:\".[]]=]
local atext = CHAR-specials-P" "-CTL
local atext = CHAR-specials-P" "-CTL + UTF8_non_ascii
local atom = CFWS^-1 * C(atext^1) * CFWS^-1
local dot_atom_text = C(atext^1 * ( P"." * atext^1 )^0)
local dot_atom = CFWS^-1 * dot_atom_text * CFWS^-1

-- Quoted Strings
local qtext = S"\33"+R("\35\91","\93\126")
local qtext = S"\33"+R("\35\91","\93\126") + UTF8_non_ascii
local qcontent = qtext + quoted_pair
local quoted_string_text = DQUOTE * Cs((FWS^-1 * qcontent)^0 * FWS^-1) * DQUOTE
local quoted_string = CFWS^-1 * quoted_string_text * CFWS^-1
Expand All @@ -56,7 +60,7 @@ local phrase = obs_phrase -- obs_phrase is more broad than `word^1`, it's really

-- Addr-spec
local obs_dtext = obs_NO_WS_CTL + quoted_pair
local dtext = R("\33\90", "\94\126") + obs_dtext
local dtext = R("\33\90", "\94\126") + obs_dtext + UTF8_non_ascii
local domain_literal_text = P"[" * Cs((FWS^-1 * dtext)^0 * FWS^-1) * P"]"

local domain_text = dot_atom_text + domain_literal_text
Expand Down
28 changes: 28 additions & 0 deletions lpeg_patterns/utf8.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- LPeg patterns for well-formed UTF-8, transcribed from the ABNF in
-- RFC 3629 Section 4. Each pattern below is annotated with the ABNF rule it
-- implements; byte values are written as decimal escapes.

local lpeg = require "lpeg"
local P = lpeg.P
local R = lpeg.R

-- UTF8-tail = %x80-BF
local tail = R("\128\191")
-- Two and three continuation octets in a row
local tail2 = tail * tail
local tail3 = tail2 * tail

-- UTF8-1 = %x00-7F
local one_byte = R("\0\127")

-- UTF8-2 = %xC2-DF UTF8-tail
local two_byte = R("\194\223") * tail

-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
--          %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
local three_byte =
	P"\224" * R("\160\191") * tail
	+ R("\225\236") * tail2
	+ P"\237" * R("\128\159") * tail
	+ R("\238\239") * tail2

-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
--          %xF4 %x80-8F 2( UTF8-tail )
local four_byte =
	P"\240" * R("\144\191") * tail2
	+ R("\241\243") * tail3
	+ P"\244" * R("\128\143") * tail2

-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
local any_char = one_byte + two_byte + three_byte + four_byte

-- UTF8-octets = *( UTF8-char )
local octets = any_char^0

local patterns = {}
patterns.UTF8_1 = one_byte
patterns.UTF8_2 = two_byte
patterns.UTF8_3 = three_byte
patterns.UTF8_4 = four_byte
patterns.UTF8_char = any_char
patterns.UTF8_octets = octets
return patterns
6 changes: 5 additions & 1 deletion spec/email_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ local EOF = lpeg.P(-1)

describe("email Addresses", function()
local email = lpeg.Ct(require "lpeg_patterns.email".email) * EOF
it("Pass valid addresses", function()
it("Parse valid addresses", function()
assert.same({"localpart", "example.com"}, email:match "localpart@example.com")
-- From https://twitter.com/errbufferoverfl/status/1019780300597891072
assert.same({"δοκιμή", "παράδειγμα.δοκιμή"}, email:match "δοκιμή@παράδειγμα.δοκιμή")
end)
it("Deny invalid addresses", function()
assert.falsy(email:match "not an address")
Expand Down Expand Up @@ -141,6 +143,8 @@ describe("mailbox", function()
local mailbox = lpeg.Ct(require "lpeg_patterns.email".mailbox) * EOF
it("matches an addr-spec", function()
assert.same({"foo", "example.com"}, mailbox:match "foo@example.com")
-- From https://github.com/hectane/hectane/issues/24#issuecomment-223486720
assert.same({"me", "example.com", display = "日本語 "}, mailbox:match "日本語 <me@example.com>")
end)
it("matches a name-addr", function()
assert.same({"foo", "example.com"}, mailbox:match "<foo@example.com>")
Expand Down
14 changes: 14 additions & 0 deletions spec/utf8_spec.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Specs for the UTF-8 patterns exported by lpeg_patterns.utf8.
describe("lpeg_patterns.utf8", function()
	local utf8 = require "lpeg_patterns.utf8"
	local lpeg = require "lpeg"
	local EOF = lpeg.P(-1)
	-- Anchor at end of input so a match means the *whole* subject is
	-- exactly one UTF-8 character; the character itself is captured.
	local UTF8_char = lpeg.C(utf8.UTF8_char) * EOF
	it("works", function()
		assert.same("f", UTF8_char:match("f"))
		assert.same("日", UTF8_char:match("日"))
	end)
	it("must not match invalid sequences", function()
		-- A lone continuation octet
		assert.same(nil, UTF8_char:match("\128"))
		-- An octet that can never appear in UTF-8
		assert.same(nil, UTF8_char:match("\255"))
		-- Overlong encoding of U+0000 (lead octet 0xC0)
		assert.same(nil, UTF8_char:match("\192\128"))
		-- UTF-16 surrogate U+D800 (0xED 0xA0 0x80)
		assert.same(nil, UTF8_char:match("\237\160\128"))
		-- Beyond U+10FFFF (0xF4 0x90 0x80 0x80)
		assert.same(nil, UTF8_char:match("\244\144\128\128"))
	end)
end)