Redaction Toolkit | Support to redact some Latin ligature letters and letters with diacritics (#1012)

cschenio · chiache-msft · web-flow · commit 2b130424f76c · 2021-12-14T16:36:33.000+08:00
## Purpose

We currently cannot handle accented letters like é and ligature letters like œ, which are common in French, so this PR adds support for them. We also expand the language coverage beyond just French. Please check the code for the actual character set that is handled.

## Validation

Before merging this PR, please make sure the items below are done and marked with 'x'.

- [x] Your code builds clean without any errors or warnings.
- [x] You have tested your change manually.
- [x] You have added unit tests.

Co-authored-by: Chia-Sheng Chen <chiache@microsoft.com>
diff --git a/scripts/redact_cli_py/CHANGELOG.md b/scripts/redact_cli_py/CHANGELOG.md
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.3] - 2021-12-13
+### Added
+- Support to redact some Latin ligature letters and letters with diacritics.
+
 ## [0.2.2] - 2021-11-17
 ### Added
 - Support to only redact specific labels.
diff --git a/scripts/redact_cli_py/README.md b/scripts/redact_cli_py/README.md
@@ -11,7 +11,7 @@ The OCR.json and labels.json will also be redacted while keeping the semantics o
 ![labels-before-after-redaction](./images/labels-before-after-redaction.png)
 
 ## Version
-Redact CLI 0.2.2
+Redact CLI 0.2.3
 
 ## Setup Environment
 
diff --git a/scripts/redact_cli_py/redact/utils/redact_policy.py b/scripts/redact_cli_py/redact/utils/redact_policy.py
@@ -3,12 +3,37 @@
 # root for license information.
 
 import re
+import unicodedata
 
 
 def first_char(item: str) -> str:
     # This replace every uppercase to 'A', lowercase to 'a', digit to '0'.
     # As known as the "Aa0" policy.
-    ret = re.sub('[A-Z]', 'A', item)
-    ret = re.sub('[a-z]', 'a', ret)
+
+    # First remove all diacritics and break typographical ligatures.
+    ret = remove_diacritics(item)
+
+    # This also takes care of other common letters in European languages (Ø)
+    # and linguistic ligatures (Œ) instead of just A-Z.
+    ret = re.sub('[A-ZØÞŁꜲÆꜴꜶꜸꜺꜼǶŒꝎẞꜨꝠ]', 'A', ret)
+    ret = re.sub('[a-zøþıłꜳæꬱꜵꜷꜹꜻꜽ🙰ꭁƕỻœꝏßꜩꝡ]', 'a', ret)
     ret = re.sub('[0-9]', '0', ret)
     return ret
+
+
+def remove_diacritics(input_str: str) -> str:
+    """Remove diacritics and typographical ligatures from the string.
+
+    - All diacritics (i.e. accents) will be removed.
+    - Typographical ligatures (e.g. ﬃ) are broken into separated characters.
+    - True linguistic ligatures (e.g. œ) will remain.
+    - Non-latin scripts will remain.
+
+    Args:
+        input_str (str): The original string with diacritics and ligatures.
+
+    Returns:
+        str: The string without diacritics and typographical ligatures.
+    """
+    nfkd_form = unicodedata.normalize('NFKD', input_str)
+    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
diff --git a/scripts/redact_cli_py/tests/utils/test_redact_policy.py b/scripts/redact_cli_py/tests/utils/test_redact_policy.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT License. See License.txt in the project
 # root for license information.
 
-from redact.utils.redact_policy import first_char
+from redact.utils.redact_policy import first_char, remove_diacritics
 
 
 class TestRedactPolicy:
@@ -25,3 +25,38 @@ def test_first_char_price(self) -> None:
         text = "$3000.00"
         actual = first_char(text)
         assert "$0000.00" == actual
+
+    def test_first_char_diacritics(self) -> None:
+        text = "Anaïs, Noël, Sørina, François, Mátyás, Agnès, Fañch, Reiß"
+        actual = first_char(text)
+        assert "Aaaaa, Aaaa, Aaaaaa, Aaaaaaaa, Aaaaaa, Aaaaa, Aaaaa, Aaaa" == actual
+
+    def test_remove_diacritics_empty(self) -> None:
+        text = ""
+        actual = remove_diacritics(text)
+        assert "" == actual
+
+    def test_remove_diacritics_with_diacritics(self) -> None:
+        text = "Português, Lô-má-jī"
+        actual = remove_diacritics(text)
+        assert "Portugues, Lo-ma-ji" == actual
+
+    def test_remove_diacritics_french_letters(self) -> None:
+        text = "çéâêîôûàèìòùëïü"
+        actual = remove_diacritics(text)
+        assert "ceaeiouaeioueiu" == actual
+
+    def test_remove_diacritics_typographical_ligature(self) -> None:
+        text = "ﬀﬃﬄﬁﬂﬆﬅ"
+        actual = remove_diacritics(text)
+        assert "ffffifflfiflstst" == actual
+
+    def test_remove_diacritics_linguistic_ligature(self) -> None:
+        text = "ꜳæꬱꜵꜷꜹꜻꜽ🙰ꭁƕỻœꝏßꜩꝡ"
+        actual = remove_diacritics(text)
+        assert text == actual
+
+    def test_remove_diacritics_boeuf_a_la_bourguignonne(self) -> None:
+        text = "bœuf à la Bourguignonne"
+        actual = remove_diacritics(text)
+        assert "bœuf a la Bourguignonne" == actual