From eb78bd7f2df15d890a53bd4042420639cbfce55a Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sun, 9 Sep 2018 22:02:17 -0700 Subject: [PATCH 1/2] lpeg_patterns/utf8: Add module --- NEWS | 1 + README.md | 12 ++++++++++++ lpeg_patterns-scm-0.rockspec | 1 + lpeg_patterns/utf8.lua | 28 ++++++++++++++++++++++++++++ spec/utf8_spec.lua | 14 ++++++++++++++ 5 files changed, 56 insertions(+) create mode 100644 lpeg_patterns/utf8.lua create mode 100644 spec/utf8_spec.lua diff --git a/NEWS b/NEWS index 5f6cd5d..7e06a09 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,7 @@ UNRELEASED - http: split into multiple modules - http.alpn: verify that protocol_id meets unique encoding criteria + - utf8: Add new utf8 module 0.5 - 2018-07-15 diff --git a/README.md b/README.md index 410e0d4..24eb027 100644 --- a/README.md +++ b/README.md @@ -340,3 +340,15 @@ Patterns for definitions from [RFC-4646 Section 2.1](https://tools.ietf.org/html - `privateuse` (optional): an array - `privateuse` (pattern): captures an array - `Language_Tag` (pattern): captures the whole language tag + + +### `utf8` + +Patterns for matching utf8 sequences from [RFC 3629 Section 4](https://tools.ietf.org/html/rfc3629#section-4) + + - `UTF8_1` (pattern): matches a 1-byte utf8 sequence + - `UTF8_2` (pattern): matches a 2-byte utf8 sequence + - `UTF8_3` (pattern): matches a 3-byte utf8 sequence + - `UTF8_4` (pattern): matches a 4-byte utf8 sequence + - `UTF8_char` (pattern): matches a single utf8 sequence + - `UTF8_octets` (pattern): matches multiple utf8 sequences in a row diff --git a/lpeg_patterns-scm-0.rockspec b/lpeg_patterns-scm-0.rockspec index a68995e..62f806e 100644 --- a/lpeg_patterns-scm-0.rockspec +++ b/lpeg_patterns-scm-0.rockspec @@ -51,5 +51,6 @@ build = { ["lpeg_patterns.http.websocket"] = "lpeg_patterns/http/websocket.lua"; ["lpeg_patterns.phone"] = "lpeg_patterns/phone.lua"; ["lpeg_patterns.language"] = "lpeg_patterns/language.lua"; + ["lpeg_patterns.utf8"] = "lpeg_patterns/utf8.lua"; }; } diff --git 
a/lpeg_patterns/utf8.lua b/lpeg_patterns/utf8.lua new file mode 100644 index 0000000..38de7f1 --- /dev/null +++ b/lpeg_patterns/utf8.lua @@ -0,0 +1,28 @@ +-- RFC 3629 Section 4 + +local lpeg = require "lpeg" +local P = lpeg.P +local R = lpeg.R + +local UTF8_tail = R("\128\191") +local UTF8_1 = R("\0\127") +local UTF8_2 = R("\194\223") * UTF8_tail +local UTF8_3 = P"\224" * R("\160\191") * UTF8_tail + + R("\225\236") * UTF8_tail * UTF8_tail + + P"\237" * R("\128\159") * UTF8_tail + + R("\238\239") * UTF8_tail * UTF8_tail +local UTF8_4 = P"\240" * R("\144\191") * UTF8_tail * UTF8_tail + + R("\241\243") * UTF8_tail * UTF8_tail * UTF8_tail + + P"\244" * R("\128\143") * UTF8_tail * UTF8_tail + +local UTF8_char = UTF8_1 + UTF8_2 + UTF8_3 + UTF8_4 +local UTF8_octets = UTF8_char^0 + +return { + UTF8_1 = UTF8_1; + UTF8_2 = UTF8_2; + UTF8_3 = UTF8_3; + UTF8_4 = UTF8_4; + UTF8_char = UTF8_char; + UTF8_octets = UTF8_octets; +} diff --git a/spec/utf8_spec.lua b/spec/utf8_spec.lua new file mode 100644 index 0000000..7cdbe9a --- /dev/null +++ b/spec/utf8_spec.lua @@ -0,0 +1,14 @@ +describe("lpeg_patterns.utf8", function() + local utf8 = require "lpeg_patterns.utf8" + local lpeg = require "lpeg" + local EOF = lpeg.P(-1) + local UTF8_char = lpeg.C(utf8.UTF8_char) * EOF + it("works", function() + assert.same("f", UTF8_char:match("f")) + assert.same("日", UTF8_char:match("日")) + end) + it("must not match invalid sequences", function() + assert.same(nil, UTF8_char:match("\128")) + assert.same(nil, UTF8_char:match("\255")) + end) +end) From 43d0b0083e24e15dab86b6639316c0bd64646905 Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sun, 9 Sep 2018 22:05:49 -0700 Subject: [PATCH 2/2] lpeg_patterns/email: Allow international email addresses (RFC 6532) --- NEWS | 1 + README.md | 2 +- lpeg_patterns/email.lua | 14 +++++++++----- spec/email_spec.lua | 6 +++++- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index 7e06a09..af9bcfa 100644 --- a/NEWS +++ b/NEWS @@ 
-3,6 +3,7 @@ UNRELEASED - http: split into multiple modules - http.alpn: verify that protocol_id meets unique encoding criteria - utf8: Add new utf8 module + - email: Allow internationalised email addresses (#15) 0.5 - 2018-07-15 diff --git a/README.md b/README.md index 24eb027..eb01fd8 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ Parses URIs as described in [RFC-3986](https://tools.ietf.org/html/rfc3986). - `mailbox` (pattern): the mailbox format: matches either `name_addr` or an addr-spec. - `name_addr` (pattern): the name and address format i.e. `Display Name` Has captures of the local_part and the domain. Captures the display name in the named capture `"display"` - - `email` (pattern): also known as an "addr-spec"; follows [RFC-5322 section 3.4.1](http://tools.ietf.org/html/rfc5322#section-3.4.1) + - `email` (pattern): also known as an "addr-spec"; follows [RFC-5322 section 3.4.1](http://tools.ietf.org/html/rfc5322#section-3.4.1) with the internationalisation extensions from [RFC-6532 section 3.1](http://tools.ietf.org/html/rfc6532#section-3.1) Has captures of the local_part and the domain Be careful trying to reconstruct the email address from the captures; you may need escaping - `local_part` (pattern): the bit before the `@` in an email address diff --git a/lpeg_patterns/email.lua b/lpeg_patterns/email.lua index 5d9fbe0..fd1ce4c 100644 --- a/lpeg_patterns/email.lua +++ b/lpeg_patterns/email.lua @@ -11,13 +11,17 @@ local Cg = lpeg.Cg local Ct = lpeg.Ct local Cs = lpeg.Cs +-- From RFC 6532 Section 3.1 +local utf8 = require "lpeg_patterns.utf8" +local UTF8_non_ascii = utf8.UTF8_2 + utf8.UTF8_3 + utf8.UTF8_4 + local core = require "lpeg_patterns.core" local CHAR = core.CHAR local CRLF = core.CRLF local CTL = core.CTL local DQUOTE = core.DQUOTE local WSP = core.WSP -local VCHAR = core.VCHAR +local VCHAR = core.VCHAR + UTF8_non_ascii local obs_NO_WS_CTL = R("\1\8", "\11\12", "\14\31") + P"\127" @@ -28,7 +32,7 @@ local quoted_pair = Cg(P"\\" * C(VCHAR + 
WSP)) + obs_qp local FWS = (WSP^0 * CRLF)^-1 * WSP^1 / " " -- Fold whitespace into a single " " -- Comments -local ctext = R"\33\39" + R"\42\91" + R"\93\126" +local ctext = R"\33\39" + R"\42\91" + R"\93\126" + UTF8_non_ascii local comment = P { V"comment" ; ccontent = ctext + quoted_pair + V"comment" ; @@ -38,13 +42,13 @@ local CFWS = ((FWS^-1 * comment)^1 * FWS^-1 + FWS ) / function() end -- Atom local specials = S[=[()<>@,;:\".[]]=] -local atext = CHAR-specials-P" "-CTL +local atext = CHAR-specials-P" "-CTL + UTF8_non_ascii local atom = CFWS^-1 * C(atext^1) * CFWS^-1 local dot_atom_text = C(atext^1 * ( P"." * atext^1 )^0) local dot_atom = CFWS^-1 * dot_atom_text * CFWS^-1 -- Quoted Strings -local qtext = S"\33"+R("\35\91","\93\126") +local qtext = S"\33"+R("\35\91","\93\126") + UTF8_non_ascii local qcontent = qtext + quoted_pair local quoted_string_text = DQUOTE * Cs((FWS^-1 * qcontent)^0 * FWS^-1) * DQUOTE local quoted_string = CFWS^-1 * quoted_string_text * CFWS^-1 @@ -56,7 +60,7 @@ local phrase = obs_phrase -- obs_phrase is more broad than `word^1`, it's really -- Addr-spec local obs_dtext = obs_NO_WS_CTL + quoted_pair -local dtext = R("\33\90", "\94\126") + obs_dtext +local dtext = R("\33\90", "\94\126") + obs_dtext + UTF8_non_ascii local domain_literal_text = P"[" * Cs((FWS^-1 * dtext)^0 * FWS^-1) * P"]" local domain_text = dot_atom_text + domain_literal_text diff --git a/spec/email_spec.lua b/spec/email_spec.lua index 4d2943d..73389be 100644 --- a/spec/email_spec.lua +++ b/spec/email_spec.lua @@ -3,8 +3,10 @@ local EOF = lpeg.P(-1) describe("email Addresses", function() local email = lpeg.Ct(require "lpeg_patterns.email".email) * EOF - it("Pass valid addresses", function() + it("Parse valid addresses", function() assert.same({"localpart", "example.com"}, email:match "localpart@example.com") + -- From https://twitter.com/errbufferoverfl/status/1019780300597891072 + assert.same({"δοκιμή", "παράδειγμα.δοκιμή"}, email:match "δοκιμή@παράδειγμα.δοκιμή") end) 
it("Deny invalid addresses", function() assert.falsy(email:match "not an address") @@ -141,6 +143,8 @@ describe("mailbox", function() local mailbox = lpeg.Ct(require "lpeg_patterns.email".mailbox) * EOF it("matches an addr-spec", function() assert.same({"foo", "example.com"}, mailbox:match "foo@example.com") + -- From https://github.com/hectane/hectane/issues/24#issuecomment-223486720 + assert.same({"me", "example.com", display = "日本語 "}, mailbox:match "日本語 <me@example.com>") end) it("matches a name-addr", function() assert.same({"foo", "example.com"}, mailbox:match "<foo@example.com>")