Skip to content

Commit 2b13042

Browse files
Redaction Toolkit | Support to redact some Latin ligature letters and letters with diacritics (#1012)
## Purpose We currently cannot handle accented letters like é or ligature letters like œ, which are common in French, so we need to support them. We also expand the language coverage beyond just French. Please check the code for the actual character set that is handled. ## Validation Before merging this PR, please make sure the items below are done and mark each completed item with 'x'. - [x] Your code builds clean without any errors or warnings. - [x] You have tested your change manually. - [x] You have added unit tests. Co-authored-by: Chia-Sheng Chen <chiache@microsoft.com>
1 parent ccb55b0 commit 2b13042

File tree

4 files changed

+68
-4
lines changed

4 files changed

+68
-4
lines changed

scripts/redact_cli_py/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
## [0.2.3] - 2021-12-13
10+
### Added
11+
- Support to redact some Latin ligature letters and letters with diacritics.
12+
913
## [0.2.2] - 2021-11-17
1014
### Added
1115
- Support to only redact specific labels.

scripts/redact_cli_py/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ The OCR.json and labels.json will also be redacted while keeping the semantics o
1111
![labels-before-after-redaction](./images/labels-before-after-redaction.png)
1212

1313
## Version
14-
Redact CLI 0.2.2
14+
Redact CLI 0.2.3
1515

1616
## Setup Environment
1717

scripts/redact_cli_py/redact/utils/redact_policy.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,37 @@
33
# root for license information.
44

55
import re
6+
import unicodedata
67

78

89
def first_char(item: str) -> str:
    """Redact a string with the "Aa0" policy.

    Every uppercase letter is replaced by 'A', every lowercase letter by 'a'
    and every decimal digit by '0'. Any character that matches none of the
    three patterns below (punctuation, whitespace, other scripts) is kept.

    Args:
        item (str): The original text to redact.

    Returns:
        str: The redacted text.
    """
    # First remove all diacritics and break typographical ligatures, so that
    # accented letters such as 'é' fall into the plain A-Z / a-z ranges.
    ret = remove_diacritics(item)

    # Besides A-Z / a-z, the classes also cover other common letters in
    # European languages (Ø) and linguistic ligatures (Œ) that have no
    # Unicode decomposition and therefore survive remove_diacritics().
    ret = re.sub('[A-ZØÞŁꜲÆꜴꜶꜸꜺꜼǶŒꝎẞꜨꝠ]', 'A', ret)
    ret = re.sub('[a-zøþıłꜳæꬱꜵꜷꜹꜻꜽ🙰ꭁƕỻœꝏßꜩꝡ]', 'a', ret)

    # On str patterns \d matches every Unicode decimal digit, not only ASCII
    # 0-9, so digits written in other scripts are not left unredacted.
    ret = re.sub(r'\d', '0', ret)
    return ret
22+
23+
24+
def remove_diacritics(input_str: str) -> str:
    """Remove diacritics and typographical ligatures from the string.

    - All diacritics (i.e. accents) will be removed.
    - Typographical ligatures (e.g. "ﬃ", U+FB03) are broken into separate
      characters ("ffi"), so the result can be longer than the input.
    - True linguistic ligatures (e.g. œ) will remain, because they have no
      Unicode decomposition.
    - The normalization is applied to the whole string regardless of script,
      so combining marks are stripped from non-Latin text as well (e.g.
      Cyrillic "й" becomes "и"); characters without a decomposition are
      left untouched.

    Args:
        input_str (str): The original string with diacritics and ligatures.

    Returns:
        str: The string without diacritics and typographical ligatures.
    """
    # NFKD splits each precomposed character into its base character followed
    # by combining marks, and expands compatibility characters (ligatures).
    decomposed = unicodedata.normalize('NFKD', input_str)

    # unicodedata.combining() returns a non-zero class for combining marks;
    # dropping those leaves only the base characters.
    return "".join(c for c in decomposed if not unicodedata.combining(c))

scripts/redact_cli_py/tests/utils/test_redact_policy.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Licensed under the MIT License. See License.txt in the project
33
# root for license information.
44

5-
from redact.utils.redact_policy import first_char
5+
from redact.utils.redact_policy import first_char, remove_diacritics
66

77

88
class TestRedactPolicy:
@@ -25,3 +25,38 @@ def test_first_char_price(self) -> None:
2525
text = "$3000.00"
2626
actual = first_char(text)
2727
assert "$0000.00" == actual
28+
29+
def test_first_char_diacritics(self) -> None:
    """Accented names are redacted to the same 'Aa' shape as plain letters."""
    names = "Anaïs, Noël, Sørina, François, Mátyás, Agnès, Fañch, Reiß"
    expected = "Aaaaa, Aaaa, Aaaaaa, Aaaaaaaa, Aaaaaa, Aaaaa, Aaaaa, Aaaa"
    assert expected == first_char(names)
33+
34+
def test_remove_diacritics_empty(self) -> None:
    """An empty string passes through unchanged."""
    result = remove_diacritics("")
    assert "" == result
38+
39+
def test_remove_diacritics_with_diacritics(self) -> None:
    """Accents are stripped while base letters and punctuation are kept."""
    accented = "Português, Lô-má-jī"
    expected = "Portugues, Lo-ma-ji"
    assert expected == remove_diacritics(accented)
43+
44+
def test_remove_diacritics_french_letters(self) -> None:
    """Each accented French letter maps to its unaccented base letter."""
    accented = "çéâêîôûàèìòùëïü"
    expected = "ceaeiouaeioueiu"
    assert expected == remove_diacritics(accented)
48+
49+
def test_remove_diacritics_typographical_ligature(self) -> None:
    """Typographical ligatures are expanded into their component letters."""
    # U+FB00..U+FB06 (ff, ffi, ffl, fi, fl, st, long-s-t ligatures), written
    # as escapes so that editors or text normalizers cannot silently expand
    # them into plain ASCII and turn this test into a no-op.
    text = "\ufb00\ufb03\ufb04\ufb01\ufb02\ufb06\ufb05"
    actual = remove_diacritics(text)
    assert "ffffifflfiflstst" == actual
53+
54+
def test_remove_diacritics_linguistic_ligature(self) -> None:
    """Linguistic ligatures have no decomposition and must be left as-is."""
    ligatures = "ꜳæꬱꜵꜷꜹꜻꜽ🙰ꭁƕỻœꝏßꜩꝡ"
    result = remove_diacritics(ligatures)
    assert ligatures == result
58+
59+
def test_remove_diacritics_boeuf_a_la_bourguignonne(self) -> None:
    """In mixed text the accent is dropped but the œ ligature is preserved."""
    phrase = "bœuf à la Bourguignonne"
    expected = "bœuf a la Bourguignonne"
    assert expected == remove_diacritics(phrase)

0 commit comments

Comments
 (0)