From 21b5d6982bbd3963be23ffc1bc7aa23a39dcc2bf Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Wed, 24 Sep 2025 17:00:07 -0700 Subject: [PATCH 1/9] rebase to main Signed-off-by: tbartley94 --- Jenkinsfile | 25 ++- .../inverse_text_normalization/he/__init__.py | 13 ++ .../he/data/measurements.tsv | 45 ++++ .../he/data/months.tsv | 13 ++ .../he/data/months_name2number.tsv | 12 + .../he/data/months_ordinal2number.tsv | 12 + .../he/data/numbers/__init__.py | 13 ++ .../he/data/numbers/decimal_fractions.tsv | 6 + .../he/data/numbers/digit.tsv | 20 ++ .../he/data/numbers/teen.tsv | 21 ++ .../he/data/numbers/thousands.tsv | 8 + .../he/data/numbers/ties.tsv | 8 + .../he/data/numbers/viable_hours.tsv | 15 ++ .../he/data/numbers/zero.tsv | 1 + .../he/data/ordinals/__init__.py | 13 ++ .../he/data/ordinals/digit.tsv | 10 + .../he/data/prefix.tsv | 17 ++ .../he/data/spaced_measurements.tsv | 17 ++ .../he/data/time/__init__.py | 13 ++ .../he/data/time/minute_to.tsv | 58 +++++ .../he/data/time/time_suffix.tsv | 8 + .../he/data/time/to_hour.tsv | 13 ++ .../he/data/whitelist.tsv | 20 ++ .../he/graph_utils.py | 121 ++++++++++ .../he/taggers/__init__.py | 13 ++ .../he/taggers/cardinal.py | 161 ++++++++++++++ .../he/taggers/date.py | 107 +++++++++ .../he/taggers/decimal.py | 149 +++++++++++++ .../he/taggers/measure.py | 105 +++++++++ .../he/taggers/ordinal.py | 43 ++++ .../he/taggers/punctuation.py | 35 +++ .../he/taggers/time.py | 207 ++++++++++++++++++ .../he/taggers/tokenize_and_classify.py | 101 +++++++++ .../he/taggers/whitelist.py | 56 +++++ .../he/taggers/word.py | 31 +++ .../inverse_text_normalization/he/utils.py | 178 +++++++++++++++ .../he/verbalizers/__init__.py | 13 ++ .../he/verbalizers/cardinal.py | 84 +++++++ .../he/verbalizers/date.py | 121 ++++++++++ .../he/verbalizers/decimal.py | 90 ++++++++ .../he/verbalizers/measure.py | 89 ++++++++ .../he/verbalizers/ordinal.py | 38 ++++ .../he/verbalizers/time.py | 122 +++++++++++ .../he/verbalizers/verbalize.py | 54 +++++ 
.../he/verbalizers/verbalize_final.py | 44 ++++ .../he/verbalizers/whitelist.py | 50 +++++ .../he/verbalizers/word.py | 34 +++ .../inverse_normalize.py | 7 +- tests/nemo_text_processing/he/__init__.py | 13 ++ .../test_cases_cardinal.txt | 138 ++++++++++++ .../test_cases_date.txt | 29 +++ .../test_cases_decimal.txt | 64 ++++++ .../test_cases_measure.txt | 9 + .../test_cases_time.txt | 34 +++ .../test_cases_whitelist.txt | 4 + .../test_full_sentences.txt | 56 +++++ .../nemo_text_processing/he/test_cardinal.py | 31 +++ tests/nemo_text_processing/he/test_date.py | 31 +++ tests/nemo_text_processing/he/test_decimal.py | 31 +++ .../he/test_full_sentences.py | 31 +++ tests/nemo_text_processing/he/test_measure.py | 31 +++ ..._sparrowhawk_inverse_text_normalization.sh | 61 ++++++ tests/nemo_text_processing/he/test_time.py | 31 +++ .../nemo_text_processing/he/test_whitelist.py | 31 +++ .../pynini_export.py | 10 + 65 files changed, 3067 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/he/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/months.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv create mode 100644 
nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/measure.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/time.py create mode 100644 
nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py create mode 100644 tests/nemo_text_processing/he/__init__.py create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt create mode 100644 
tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt create mode 100644 tests/nemo_text_processing/he/test_cardinal.py create mode 100644 tests/nemo_text_processing/he/test_date.py create mode 100644 tests/nemo_text_processing/he/test_decimal.py create mode 100644 tests/nemo_text_processing/he/test_full_sentences.py create mode 100644 tests/nemo_text_processing/he/test_measure.py create mode 100644 tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh create mode 100644 tests/nemo_text_processing/he/test_time.py create mode 100644 tests/nemo_text_processing/he/test_whitelist.py diff --git a/Jenkinsfile b/Jenkinsfile index 3781a171d..69c0e879b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -24,6 +24,7 @@ pipeline { SV_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/11-13-24-0' IT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-22-24-0' + HE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-24-25-0' HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' @@ -253,7 +254,24 @@ pipeline { } } } - + stage('L0: Create He TN/ITN Grammars & MR') { + when { + anyOf { + branch 'main' + branch 'staging/**' + branch 'staging_*' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: HE ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=he --text="ת " --cache_dir ${HE_TN_CACHE}' + } + } + } + } stage('L0: Create HY TN/ITN Grammars & MR') { when { anyOf { @@ -413,6 +431,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not 
pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}' } } + stage('L1: Run all HE TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/he/ -m "not pleasefixme" --cpu --tn_cache_dir ${HE_TN_CACHE}' + } + } } } diff --git a/nemo_text_processing/inverse_text_normalization/he/__init__.py b/nemo_text_processing/inverse_text_normalization/he/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv b/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv new file mode 100644 index 000000000..fbd061bc5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv @@ -0,0 +1,45 @@ +°F פרנהייט +°C צלסיוס +° מעלות +°F מעלות פרנהייט +°C מעלות צלסיוס +K קלווין +% אחוז +% אחוזים +Hz הרץ +kW קילוואט +kW קילו ואט +kW קילו וואט +kWh קילו ואט לשעה +kWh קילוואט לשעה +Wh ואט לשעה +W ואט +ghz ג׳יגה הרץ +ghz גיגה הרץ +khz קילו הרץ +mhz מגה הרץ +v וולט +nm ננומטר +mA מילי אמפר +tW טרה ואט +mv מילי וולט +mW מגה ואט +μm מיקרומטר +" אינץ׳ +cc סי סי +ω אוהם +db דציבל +db דציבלים +kb קילו ביט +mb מגה ביט +gb ג׳יגה ביט +gb גיגה ביט +tb טרה ביט +pb פטה ביט +mb מגה בייט +kb קילו בייט +gb ג׳יגה בייט +gb גיגה בייט +tb טרה בייט +pb פטה בייט +A אמפר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months.tsv new file mode 100644 index 000000000..05415cc3d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/months.tsv @@ -0,0 +1,13 @@ +ינואר +פברואר +מרץ +מרס +אפריל +מאי +יוני +יולי +אוגוסט +ספטמבר +אוקטובר +נובמבר +דצמבר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv new file mode 100644 index 000000000..651118ca1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv @@ -0,0 +1,12 @@ +ינואר 1 +פברואר 2 +מרץ 3 +אפריל 4 +מאי 5 +יוני 6 +יולי 7 +אוגוסט 8 +ספטמבר 9 +אוקטובר 10 +נובמבר 11 +דצמבר 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv new file mode 100644 
index 000000000..e75a452d8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv @@ -0,0 +1,12 @@ +ראשון 1 +שני 2 +שלישי 3 +רביעי 4 +חמישי 5 +שישי 6 +שביעי 7 +שמיני 8 +תשיעי 9 +עשירי 10 +אחת עשרה 11 +שתיים עשרה 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv new file mode 100644 index 000000000..d88316454 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv @@ -0,0 +1,6 @@ +חצי 5 +רבע 25 +שלושת רבעי 75 +עשירית 1 +שתי עשיריות 2 +חמישית 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv new file mode 100644 index 000000000..68c02dd42 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv @@ -0,0 +1,20 @@ +אחד 1 +שניים 2 +שני 2 +שלושה 3 +ארבעה 4 +חמישה 5 +שישה 6 +שבעה 7 +שמונה 8 +תשעה 9 +אחת 1 +שתיים 2 +שתים 2 +שתי 2 +שלוש 3 +ארבע 4 +חמש 5 +שש 6 +שבע 7 +תשע 9 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv new file mode 100644 index 000000000..26f1a5a4c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv @@ -0,0 +1,21 @@ +עשר 10 +אחד עשר 11 +שניים עשר 12 +שלושה עשר 13 +ארבעה עשר 14 +חמישה עשר 15 +שישה עשר 16 +שבעה עשר 17 +שמונה עשר 18 +תשעה עשר 19 +עשרה 10 +אחת עשרה 11 +שתיים עשרה 12 +שתים עשרה 12 +שלוש עשרה 13 +ארבע עשרה 14 +חמש עשרה 15 +שש עשרה 16 +שבע עשרה 17 +שמונה עשרה 18 +תשע עשרה 19 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv new file mode 100644 index 000000000..534789509 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv @@ -0,0 +1,8 @@ +שלושת 3 +ארבעת 4 +חמשת 5 +ששת 6 +שבעת 7 +שמונת 8 +תשעת 9 +עשרת 10 diff --git 
a/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv new file mode 100644 index 000000000..b6dd59ca3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv @@ -0,0 +1,8 @@ +עשרים 2 +שלושים 3 +ארבעים 4 +חמישים 5 +שישים 6 +שבעים 7 +שמונים 8 +תשעים 9 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv new file mode 100644 index 000000000..6a2cb1307 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv @@ -0,0 +1,15 @@ +אחד 1 +אחת 1 +שתיים 2 +שתים 2 +שלוש 3 +ארבע 4 +חמש 5 +שש 6 +שבע 7 +שמונה 8 +תשע 9 +עשר 10 +אחת עשרה 11 +שתיים עשרה 12 +שתים עשרה 12 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv new file mode 100644 index 000000000..a0b033c5d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv @@ -0,0 +1 @@ +אפס 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv new file mode 100644 index 000000000..036e1433a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv @@ -0,0 +1,10 @@ +ראשון אחד +שני שניים +שלישי שלושה +רביעי ארבעה +חמישי חמישה +שישי שישה +שביעי שבעה +שמיני שמונה +תשיעי תשעה +עשירי עשרה \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv new file mode 100644 index 000000000..988d6aedf --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv @@ -0,0 +1,17 @@ +וה +שה +ב +כ +ל +מ +ה +ו +וב +ול +ש +מה +ומ +שכ +שב +בכ +לכ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv b/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv new file mode 100644 index 000000000..a97b03412 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv @@ -0,0 +1,17 @@ +ק״מ קילומטר +ק״מ קילומטרים +מ׳ מטר +מ׳ מטרים +ס״מ סנטימטר +ס״מ סנטימטרים +מ״מ מילימטר +מ״מ מילימטרים +מ״ג מיליגרם +מ״ג מיליגרמים +מ״ל מיליליטר +ק״ג קילוגרם +ק״ג קילוגרמים +קמ״ש קילומטר לשעה +קמ״ש קילומטרים לשעה +ג׳ גרם +ג׳ גרמים \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py 
b/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv new file mode 100644 index 000000000..38858859c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv @@ -0,0 +1,58 @@ +02 58 +03 57 +04 56 +05 55 +06 54 +07 53 +08 52 +09 51 +10 50 +11 49 +12 48 +13 47 +14 46 +15 45 +16 44 +17 43 +18 42 +19 41 +20 40 +21 39 +22 38 +23 37 +24 36 +25 35 +26 34 +27 33 +28 32 +29 31 +30 30 +31 29 +32 28 +33 27 +34 26 +35 25 +36 24 +37 23 +38 22 +39 21 +40 20 +41 19 +42 18 +43 17 +44 16 +45 15 +46 14 +47 13 +48 12 +49 11 +50 10 +51 09 +52 08 +53 07 +54 06 +55 05 +56 04 +57 03 +58 01 +59 01 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv new file mode 100644 index 000000000..b5799a0b9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv @@ -0,0 +1,8 @@ +בבוקר +לפנות בוקר +לפנות ערב +בערב +בצהריים +בלילה +אחרי הצהריים +אחר הצהריים \ No 
newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv new file mode 100644 index 000000000..5689943fd --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv @@ -0,0 +1,13 @@ +אחת 12 +שתיים 1 +שלוש 2 +ארבע 3 +חמש 4 +שש 5 +שבע 6 +שמונה 7 +תשע 8 +עשר 9 +אחת עשרה 10 +שתיים עשרה 11 +חצות 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv new file mode 100644 index 000000000..9844685c5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv @@ -0,0 +1,20 @@ +אח״כ אחר כך +וכו׳ וכולי +בריה״מ ברית המועצות +ארה״ב ארצות הברית +עו״ד עורך דין +עו״ד עורכת דין +עו״ד עורכי דין +עו״ד עורכות דין +רו״ח רואה חשבון +רו״ח רואת חשבון +רו״ח רואי חשבון +רו״ח רואות חשבון +לפנה״ס לפני הספירה +ד״ר דוקטור +פרופ׳ פרופסור +אמא אימא +כל כול +מאיתנו מאתנו +ישארו יישארו +יתכן ייתכן \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py new file mode 100644 index 000000000..7d96154a9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py @@ -0,0 +1,121 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path + +import pynini +from pynini import Far +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, delete_space +from nemo_text_processing.text_normalization.en.utils import load_labels + + +NEMO_ALPHA_HE = pynini.union(*"אבגדהוזחטיכלמםנןסעפףצץקרשת").optimize() + +delete_optional_and = pynini.closure(pynutil.delete("ו")) +delete_and = pynini.cross("ו", "") + +#################### +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +MINUS = pynini.union("מינוס").optimize() + + +def string_map_cased(input_file: str): + labels = load_labels(input_file) + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +def apply_fst(text, fst): + """ Given a string input, returns the output string + produced by traversing the path with lowest weight. + If no valid path accepts input string, returns an + error. + """ + try: + print(pynini.shortestpath(text @ fst).string()) + except pynini.FstOpError: + print(f"Error: No valid output with given input: '{text}'") + + +class GraphFst: + """ + Base class for all grammar fsts. 
+ + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> 'pynini.FstLike': + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> 'pynini.FstLike': + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> 'pynini.FstLike': + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py new file mode 100644 index 000000000..49d3a26ec --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -0,0 +1,161 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import (
    NEMO_ALPHA_HE,
    GraphFst,
    delete_and,
    delete_optional_and,
)
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path

from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_DIGIT,
    NEMO_SIGMA,
    NEMO_SPACE,
    insert_space,
    delete_space,
)
from nemo_text_processing.text_normalization.en.utils import load_labels


class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals in Hebrew
    e.g. מינוס עשרים ושלוש ("minus twenty three" in Hebrew) -> cardinal { negative: "-" integer: "23" }

    Exposes intermediate graphs reused by other taggers:
        graph_two_digit: numbers 1-99
        graph_hundred: numbers 1-999 (no leading zeros)
        graph_thousands: numbers up to 999,999 (or "0")
        graph_no_exception: any cardinal, including single digits
        graph_wo_viable_hours: cardinals minus words that could be read as clock hours
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")

        # digits 0-9 and the prefix letters (e.g. -ב, -ל) that may attach to a number
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv"))

        # teens (10-19) and ties (20,30,...); a tie may be followed by "and"+digit
        # (e.g. "twenty and three" -> 23), otherwise a "0" ones-place is inserted
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_ties += pynini.union(delete_space + delete_and + graph_digit, pynutil.insert("0", weight=0.001))

        graph_two_digit = pynini.union(
            graph_teen, graph_ties
        )
        self.graph_two_digit = graph_two_digit | graph_digit

        # hundreds: "מאה" (100) / "מאתיים" (200) are irregular; 3-9 hundreds are
        # digit + "מאות"; the small positive weights prefer longer matches
        hundred = pynini.string_map([("מאה", "1"), ("מאתיים", "2")])
        delete_hundred = pynutil.delete("מאות")
        graph_hundred = delete_optional_and + pynini.union(
            hundred,
            graph_digit + delete_space + delete_hundred,
            pynutil.insert("0", weight=0.001),
        )
        graph_hundred += delete_space
        graph_hundred += pynini.union(
            delete_optional_and + graph_two_digit,
            pynutil.insert("0") + delete_space + delete_and + graph_digit,
            pynutil.insert("00", weight=0.001),
        )
        graph_hundred = pynini.union(
            graph_hundred,
            pynutil.insert("0") + graph_two_digit,
            pynutil.insert("00") + graph_digit
        )

        # keep only outputs containing a non-zero digit (reject all-zero strings)
        self.graph_hundred = graph_hundred @ (
            pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
        )

        # thousands: "אלף" (1000) / "אלפיים" (2000) irregular; larger counts use
        # a 1-999 prefix followed by "אלפים"/"אלף"
        thousand = pynini.string_map([("אלף", "1"), ("אלפיים", "2")])
        thousand_digit = pynini.string_file(get_abs_path("data/numbers/thousands.tsv"))
        delete_thousand = pynutil.delete("אלפים") | pynutil.delete("אלף", weight=0.001)

        large_number_prefix = pynini.union(
            graph_hundred,
            pynutil.insert("0") + graph_two_digit,
            pynutil.insert("00") + thousand_digit
        )
        many_thousands = large_number_prefix + delete_space + delete_thousand

        graph_thousands = delete_optional_and + pynini.union(
            (pynutil.insert("00") + thousand),
            many_thousands,
            pynutil.insert("000", weight=0.001)
        )

        # strip leading zeros, but keep a lone "0"
        self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero)
        self.graph_thousands @= pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
        )

        # millions: bare "מיליון" means 1,000,000; otherwise prefix + "מיליונים"/"מיליון"
        million = pynini.string_map([("מיליון", "001")])

        delete_millions = pynutil.delete("מיליונים") | pynutil.delete("מיליון", weight=0.001)
        many_millions = large_number_prefix + delete_space + delete_millions

        graph_millions = pynini.union(many_millions, million, pynutil.insert("000", weight=0.001))

        graph = pynini.union(
            graph_millions + delete_space + graph_thousands + delete_space + graph_hundred,
            graph_zero
        )

        # strip leading zeros from the concatenated millions/thousands/hundreds output
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0"))
            + pynini.difference(NEMO_DIGIT, "0")
            + pynini.closure(NEMO_DIGIT), "0"
        )

        # single-digit words (and their "ו"-prefixed forms) that should NOT be
        # converted when standing alone — too ambiguous in running text
        labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv"))
        labels_exception = list(set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"]))
        labels_exception += ["ו" + label for label in labels_exception]
        graph_exception = pynini.union(*labels_exception).optimize()

        # require the input to start with a Hebrew letter
        graph = ((NEMO_ALPHA_HE + NEMO_SIGMA) @ graph).optimize()

        self.graph_no_exception = graph

        ### Token insertion
        minus_graph = pynutil.insert("negative: ") + pynini.cross("מינוס", "\"-\"") + NEMO_SPACE
        optional_minus_graph = pynini.closure(minus_graph, 0, 1)

        optional_prefix_graph = pynini.closure(
            pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + insert_space, 0, 1
        )

        # cardinals minus the ambiguous small-digit words above
        graph_wo_small_digits = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

        # cardinals minus words that could also be read as clock hours (used by TimeFst)
        cardinal_wo_viable_hours = load_labels(get_abs_path("data/numbers/viable_hours.tsv"))
        cardinal_wo_viable_hours = list(set([x[0] for x in cardinal_wo_viable_hours]))
        viable_hours_exception = pynini.union(*cardinal_wo_viable_hours).optimize()
        self.graph_wo_viable_hours = (pynini.project(graph, "input") - viable_hours_exception.arcsort()) @ graph

        # small (ambiguous) numbers are only tagged when an explicit minus disambiguates them
        small_number_with_minus = (
            insert_space + minus_graph + pynutil.insert("integer: \"") + self.graph_no_exception + pynutil.insert("\"")
        )

        big_number_with_optional_minus = (
            optional_minus_graph + pynutil.insert("integer: \"") + graph_wo_small_digits + pynutil.insert("\"")
        )

        graph = optional_prefix_graph + (small_number_with_minus | big_number_with_optional_minus)

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import (
    delete_extra_space,
    delete_space,
    insert_space,
)


def _get_year_graph(graph_two_digits, graph_thousands):
    """
    Build the transducer for a spoken year, e.g. twenty twenty -> 2020.

    Args:
        graph_two_digits: FST for a 2-digit number (used for "20 19"-style years)
        graph_thousands: FST for a full cardinal (used for "two thousand twelve"-style years)
    """
    year_graph = pynini.union(
        (graph_two_digits + delete_space + graph_two_digits), graph_thousands  # 20 19, 40 12, 20 20
    )  # 2012 - assuming no limit on the year

    year_graph.optimize()
    return year_graph


class DateFst(GraphFst):
    """
    Finite state transducer for classifying date in Hebrew,
    e.g. אחד במאי אלף תשע מאות שמונים ושלוש -> date { day: "1" month_prefix: "ב" month: "5" year: "1983" }
    e.g. הראשון ביוני אלפיים ושתיים עשרה -> date { day_prefix: "ה" day: "1" month_prefix: "ב" month: "6" year: "2012" }
    e.g. העשירי ביוני -> date { day_prefix: "ה" day: "10" month_prefix: "ב" month: "6" }
    e.g. מרץ אלף תשע מאות שמונים ותשע -> date { month: "מרץ" year: "1989" }
    e.g. בינואר עשרים עשרים -> date { month_prefix: "ב" month: "ינואר" year: "2020" }

    NOTE(review): the examples above use day_prefix/month_prefix field names, but the
    code below emits the prefix as morphosyntactic_features — confirm against the
    date verbalizer which field name is expected.

    Args:
        cardinal: CardinalFst
        ordinal: OrdinalFst
    """

    def __init__(self, cardinal: GraphFst, ordinal: GraphFst):
        super().__init__(name="date", kind="classify")

        ordinal_graph = ordinal.graph
        two_digits_graph = cardinal.graph_two_digit

        # day may be spoken as cardinal or ordinal; negative weight favors the date reading
        day_graph = pynutil.add_weight(two_digits_graph | ordinal_graph, -0.7)
        day_graph = pynutil.insert("day: \"") + day_graph + pynutil.insert("\"")

        # month kept as its Hebrew name (e.g. מרץ -> מרץ)
        month_names = pynini.string_file(get_abs_path("data/months.tsv"))
        month_names_graph = pynutil.insert("month: \"") + month_names + pynutil.insert("\"")

        # month name -> month number (e.g. מאי -> 5)
        month_name2number = pynini.string_file(get_abs_path("data/months_name2number.tsv"))
        month_name2number_graph = pynutil.insert("month: \"") + month_name2number + pynutil.insert("\"")

        # spoken ordinal month -> month number
        month_number2number = pynini.string_file(get_abs_path("data/months_ordinal2number.tsv"))
        month_number2number_graph = (pynutil.insert("month: \"") + month_number2number + pynutil.insert("\""))

        all_month_graph = month_name2number_graph | month_number2number_graph

        year_graph = _get_year_graph(two_digits_graph, cardinal.graph_thousands)
        graph_year = delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

        # Hebrew prefix letters that may attach to the day/month/year word
        prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv"))
        delete_prefix = pynutil.delete(prefix_graph)

        graph_prefix = pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"")
        # "שנה"/"שנת" ("year (of)") marker, optionally with its own prefix letter
        year_prefix_graph = pynutil.insert("morphosyntactic_features: \"") + pynini.closure(prefix_graph, 0, 1) + pynini.union("שנה", "שנת") + pynutil.insert("\"")


        # day + month (no year), month given by name-to-number mapping only
        graph_dm = (
            pynini.closure(graph_prefix + insert_space, 0, 1)
            + day_graph
            + insert_space
            + delete_space
            + pynini.closure(delete_prefix + insert_space, 0, 1)
            + month_name2number_graph
        )

        # day + month + year
        graph_dmy = (
            pynini.closure(graph_prefix + insert_space, 0, 1)
            + day_graph
            + insert_space
            + delete_space
            + pynini.closure(delete_prefix + insert_space, 0, 1)
            + all_month_graph
            + graph_year
        )

        # month-name + year, and year alone introduced by "שנה"/"שנת"
        graph_my = pynini.closure(graph_prefix + insert_space, 0, 1) + month_names_graph + graph_year
        graph_y_only = year_prefix_graph + graph_year

        final_graph = graph_dm | graph_dmy | graph_my | graph_y_only
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral in Hebrew,
    e.g. one million -> integer_part: "1" quantity: "million"
    e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # strip leading zeros; the quantity word supplies the magnitude
    numbers = cardinal_up_to_hundred @ (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )

    # Fix: previously built ["אלף", "מיליון", "מיליארד"] and immediately filtered
    # out "אלף" — state the intended set directly. "אלף" (thousand) is only a
    # valid quantity after a decimal (e.g. "one point five thousand"); after a
    # bare cardinal it is already consumed by the cardinal grammar itself.
    suffix = pynini.union("מיליון", "מיליארד").optimize()

    res = (
        pynutil.insert("integer_part: \"")
        + numbers
        + pynutil.insert("\"")
        + delete_extra_space
        + pynutil.insert("quantity: \"")
        + suffix
        + pynutil.insert("\"")
    )
    res |= decimal + delete_extra_space + pynutil.insert("quantity: \"") + (suffix | "אלף") + pynutil.insert("\"")

    return res


class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal in Hebrew
    e.g. עשרים ושלוש וחצי -> decimal { integer_part: "23" fractional_part: "5" }
    e.g. אחד נקודה שלוש -> decimal { integer_part: "1" fractional_part: "3" }
    e.g. ארבע נקודה חמש מיליון -> decimal { integer_part: "4" fractional_part: "5" quantity: "מיליון" }
    e.g. מינוס ארבע מאות נקודה שלוש שתיים שלוש -> decimal { negative: "true" integer_part: "400" fractional_part: "323" }
    e.g. אפס נקודה שלושים ושלוש -> decimal { integer_part: "0" fractional_part: "33" }

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # optional Hebrew prefix letter attached to the number
        prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv"))
        optional_prefix_graph = pynini.closure(
            pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + insert_space, 0, 1
        )

        # all cardinals
        cardinal_graph = cardinal.graph_no_exception

        # verbose fractions ("and a half", "and a quarter", ...)
        fractions = pynini.string_file(get_abs_path("data/numbers/decimal_fractions.tsv"))
        fractions_graph = delete_zero_or_one_space + delete_and + fractions
        fractions_graph = pynutil.insert("fractional_part: \"") + fractions_graph + pynutil.insert("\"")

        # identify decimals that could be understood as a time ("half", "quarter"),
        # and don't convert them, to avoid ambiguity with TimeFst
        viable_minutes_verbose = ["חצי", "רבע"]
        viable_minutes_exception = pynini.union(*viable_minutes_verbose).optimize()
        fractions_wo_minutes = (pynini.project(fractions, "input") - viable_minutes_exception.arcsort()) @ fractions
        fractions_wo_minutes = delete_zero_or_one_space + delete_and + fractions_wo_minutes
        fractions_wo_minutes = pynutil.insert("fractional_part: \"") + fractions_wo_minutes + pynutil.insert("\"")

        # fractional digit sequence after "נקודה" (point): digits, zero, or two-digit chunks
        graph_decimal = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_decimal |= pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_decimal |= cardinal.graph_two_digit

        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
        self.graph = graph_decimal

        point = pynutil.delete("נקודה")

        graph_negative = pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space
        optional_graph_negative = pynini.closure(graph_negative, 0, 1,)

        graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"")

        # integer could be an hour, but minutes cannot: convert to decimal
        viable_hour_unviable_minutes = graph_integer + delete_extra_space + fractions_wo_minutes

        # integer cannot be an hour, but minutes can: convert to decimal
        unviable_hour_viable_minutes = (
            pynutil.insert("integer_part: \"")
            + cardinal.graph_wo_viable_hours
            + pynutil.insert("\"")
            + delete_extra_space
            + fractions_graph
        )

        # minus sign followed by ambiguous decimal: convert to decimal, there is no negative time
        negative_viable_time = graph_negative + graph_integer + delete_extra_space + fractions_graph

        # all decimals with fractions, not excluding anything (used in other FSTs)
        all_decimals_wo_point = graph_integer + delete_extra_space + fractions_graph

        # only cases with fractional part that cannot be interpreted as time
        graph_wo_point = viable_hour_unviable_minutes | unviable_hour_viable_minutes | negative_viable_time

        # all decimals with the word "point"
        graph_w_point = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1) + point + delete_extra_space + graph_fractional
        )

        final_graph_wo_sign = graph_w_point | graph_wo_point
        self.final_graph_wo_sign = graph_w_point | all_decimals_wo_point
        final_graph = optional_prefix_graph + optional_graph_negative + final_graph_wo_sign

        quantity_graph = get_quantity(self.final_graph_wo_sign, cardinal.graph_hundred)
        final_graph |= optional_prefix_graph + optional_graph_negative + quantity_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_SPACE,
    delete_extra_space,
    delete_space,
    delete_zero_or_one_space,
    insert_space,
)


class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure in Hebrew
    e.g. חמש עשרה אחוז -> measure { cardinal { integer: "15" } units: "%" }
    e.g. מינוס חמש עשרה אחוז -> measure { negative: "-" cardinal { integer: "15" } units: "%" }
    e.g. שלוש מיליגרם -> measure { cardinal { integer: "3" } spaced_units: "מ״ג" }
    e.g. אלף אחוז -> measure { cardinal { integer: "1000" } units: "%" }
    e.g. אחוז אחד -> measure { units: "%" cardinal { integer: "1" } }
    e.g. סנטימטר אחד -> measure { spaced_units: "ס״מ" cardinal { integer: "1" } }

    NOTE(review): the spaced-unit examples above show a `spaced_units` field, but the
    code emits `units: "[SPACE]..."` — confirm which form the measure verbalizer expects.

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: CardinalFst, decimal: DecimalFst):
        super().__init__(name="measure", kind="classify")

        # optional negative sign
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("מינוס", "\"-\"") + NEMO_SPACE, 0, 1
        )

        # optional Hebrew prefix letter attached to the measure expression
        prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv"))
        optional_prefix_graph = pynini.closure(
            pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + insert_space, 0, 1
        )

        # cardinal numbers
        cardinal_graph = cardinal.graph_no_exception

        # Let singular apply to values > 1 as they could be part of an adjective phrase (e.g. 14 foot tall building)
        subgraph_decimal = (
            pynutil.insert("decimal { ") + decimal.final_graph_wo_sign + pynutil.insert(" }") + delete_extra_space
        )

        subgraph_cardinal = (
            pynutil.insert("cardinal { ")
            + pynutil.insert("integer: \"")
            + cardinal_graph
            + pynutil.insert("\"")
            + pynutil.insert(" }")
            + delete_extra_space
        )

        # convert units; TSVs map written->spoken, so invert to get spoken->written
        joined_units = pynini.string_file(get_abs_path("data/measurements.tsv"))
        joined_units = pynini.invert(joined_units)
        joined_units = pynutil.insert("units: \"") + joined_units + pynutil.insert("\"")

        spaced_units = pynini.string_file(get_abs_path("data/spaced_measurements.tsv"))
        spaced_units = pynini.invert(spaced_units)
        spaced_units = pynutil.insert("units: \"\[SPACE\]") + spaced_units + pynutil.insert("\"")

        # in joint units the unit is concatenated to the number, in spaced unit separate the unit with a space
        units_graph = joined_units | spaced_units

        # A separate graph is needed for "one", since the word order is reversed:
        # we say "ten percent" for 10% but "percent one" for 1%
        one = pynini.string_map([("אחד", "1")])
        one_graph = (
            insert_space
            + pynutil.insert("cardinal { ")
            + pynutil.insert("integer: \"")
            + one
            + pynutil.insert("\"")
            + pynutil.insert(" }")
        )

        number_graph = subgraph_decimal | subgraph_cardinal
        number_unit_graph = (number_graph + units_graph) | (units_graph + delete_space + one_graph)

        final_graph = optional_prefix_graph + optional_graph_negative + number_unit_graph + delete_zero_or_one_space
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
ראשון -> ordinal { integer: "1" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="ordinal", kind="classify") + + cardinal_graph = cardinal.graph_no_exception + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) + graph = NEMO_SIGMA + graph_digit + + self.graph = graph @ cardinal_graph + + final_graph = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py new file mode 100644 index 000000000..8dae89220 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. 
a, -> tokens { name: "a" } tokens { name: "," } + """ + + def __init__(self): + super().__init__(name="punctuation", kind="classify") + + s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~" + punct = pynini.union(*s) + + graph = pynutil.insert("name: \"") + punct + pynutil.insert("\"") + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py new file mode 100644 index 000000000..bf9bf39a1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py @@ -0,0 +1,207 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, delete_and +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path, integer_to_text +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + delete_extra_space, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time in Hebrew. + Conversion is made only when am / pm time is not ambiguous! + e.g. שלוש דקות לחצות -> time { minutes: "57" hours: "23" } + e.g. 
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time in Hebrew.
    Conversion is made only when am / pm time is not ambiguous!
    e.g. שלוש דקות לחצות -> time { minutes: "57" hours: "23" }
    e.g. באחת ושתי דקות בצהריים -> time { prefix: "ב" hours: "1" minutes: "02" suffix: "צהריים" }
    e.g. שתיים ועשרה בבוקר -> time { hours: "2" minutes: "10" suffix: "בוקר" }
    e.g. שתיים ועשרה בצהריים -> time { hours: "2" minutes: "10" suffix: "צהריים" }
    e.g. שתיים עשרה ושלוש דקות אחרי הצהריים -> time { hours: "12" minutes: "03" suffix: "צהריים" }
    e.g. רבע לשש בערב -> time { minutes: "45" hours: "5" suffix: "ערב" }

    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        # hours, minutes, seconds, suffix, zone, style, speak_period
        to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv"))
        minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv"))
        suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))

        # optional Hebrew prefix letter attached to the time expression
        time_prefix = pynini.string_file(get_abs_path("data/prefix.tsv"))
        time_prefix_graph = pynutil.insert("morphosyntactic_features: \"") + time_prefix + pynutil.insert("\"") + insert_space

        optional_time_prefix_graph = pynini.closure(time_prefix_graph, 0, 1)

        # verbose minute words spoken AFTER the hour ("and a half", "and a quarter", ...)
        graph_minute_verbose = pynini.string_map(
            [
                ("שלושת רבעי", "45"),
                ("חצי", "30"),
                ("רבע", "15"),
                ("עשרים", "20"),
                ("עשרה", "10"),
                ("חמישה", "05"),
                ("דקה", "01"),
                ("שתי", "02"),
            ]
        )

        # verbose minute words spoken BEFORE the hour ("quarter to", "ten to", ...);
        # mapped directly to minutes-past of the previous hour
        graph_minute_to_verbose = pynini.string_map(
            [("רבע", "45"), ("עשרה", "50"), ("חמישה", "55"), ("עשרים", "40"), ("עשרים וחמישה", "35"), ("דקה", "59"),]
        )

        # cardinal grammar reused for hour/minute values; negative weight prefers the
        # time reading over a plain cardinal
        cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

        # feminine spoken forms for hours 1-12 and minutes 2-59
        labels_hour = [integer_to_text(x, only_fem=True)[0] for x in range(1, 13)]
        labels_minute_single = [integer_to_text(x, only_fem=True)[0] for x in range(2, 10)]
        labels_minute_double = [integer_to_text(x, only_fem=True)[0] for x in range(10, 60)]

        midnight = pynini.string_map([("חצות", "0")])
        graph_hour = pynini.union(*labels_hour) @ cardinal
        graph_hour |= midnight
        # pad single-digit minutes to two digits (5 -> 05)
        add_leading_zero_to_double_digit = pynutil.insert("0") + NEMO_DIGIT
        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal @ add_leading_zero_to_double_digit
        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal

        final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")

        graph_minute = pynini.union(pynutil.insert("00"), graph_minute_single, graph_minute_double)

        # daypart suffix (morning/noon/evening) that disambiguates am/pm
        final_suffix = pynutil.insert("suffix: \"") + suffix_graph + pynutil.insert("\"")
        final_suffix = delete_space + insert_space + final_suffix

        # hour "and" minutes, with optional trailing word "דקות" (minutes)
        graph_h_and_m = (
            final_graph_hour
            + delete_space
            + delete_and
            + insert_space
            + pynutil.insert("minutes: \"")
            + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose)
            + pynutil.insert("\"")
            + (pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1))
        )

        # verbose "minutes-to-hour" form, e.g. "quarter to six"; "ל" = "to"
        graph_special_m_to_h_suffix_time = (
            pynutil.insert("minutes: \"")
            + graph_minute_to_verbose
            + pynutil.insert("\"")
            + delete_space
            + pynutil.delete("ל")
            + insert_space
            + pynutil.insert("hours: \"")
            + to_hour_graph
            + pynutil.insert("\"")
        )

        # numeric "minutes-to-hour" form, e.g. "three minutes to six"
        graph_m_to_h_suffix_time = (
            pynutil.insert("minutes: \"")
            + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph
            + pynutil.insert("\"")
            + pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1)
            + delete_space
            + pynutil.delete("ל")
            + insert_space
            + pynutil.insert("hours: \"")
            + to_hour_graph
            + pynutil.insert("\"")
        )

        # hour (+ optional minutes) that REQUIRES a daypart suffix to disambiguate
        graph_h = (
            optional_time_prefix_graph
            + delete_zero_or_one_space
            + final_graph_hour
            + delete_extra_space
            + pynutil.insert("minutes: \"")
            + (pynutil.insert("00") | graph_minute)
            + pynutil.insert("\"")
            + final_suffix
        )

        # midnight needs no suffix — it is unambiguous
        midnight_graph = (
            optional_time_prefix_graph
            + delete_zero_or_one_space
            + pynutil.insert("hours: \"")
            + midnight
            + pynutil.insert("\"")
            + insert_space
            + pynutil.insert("minutes: \"")
            + (pynutil.insert("00") | graph_minute)
            + pynutil.insert("\"")
        )

        # "midnight and <minutes>"
        graph_midnight_and_m = (
            pynutil.insert("hours: \"")
            + midnight
            + pynutil.insert("\"")
            + delete_space
            + delete_and
            + insert_space
            + pynutil.insert("minutes: \"")
            + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose)
            + pynutil.insert("\"")
            + (pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1))
        )

        # verbose "minutes to midnight", e.g. "quarter to midnight"
        to_midnight_verbose_graph = (
            pynutil.insert("minutes: \"")
            + graph_minute_to_verbose
            + pynutil.insert("\"")
            + delete_space
            + pynutil.delete("ל")
            + insert_space
            + pynutil.insert("hours: \"")
            + to_hour_graph
            + pynutil.insert("\"")
        )

        # numeric "minutes to midnight"
        graph_m_to_midnight = (
            pynutil.insert("minutes: \"")
            + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph
            + pynutil.insert("\"")
            + pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1)
            + delete_space
            + pynutil.delete("ל")
            + insert_space
            + pynutil.insert("hours: \"")
            + to_hour_graph
            + pynutil.insert("\"")
        )

        final_graph_midnight = (
            optional_time_prefix_graph
            + delete_zero_or_one_space
            + (midnight_graph | to_midnight_verbose_graph | graph_m_to_midnight | graph_midnight_and_m)
        )

        # suffix-bearing forms, plus the suffix-free hour form and the midnight forms
        final_graph = (
            optional_time_prefix_graph
            + delete_zero_or_one_space
            + (graph_h_and_m | graph_special_m_to_h_suffix_time | graph_m_to_h_suffix_time)
            + final_suffix
        )
        final_graph |= graph_h
        final_graph |= final_graph_midnight

        final_graph = self.add_tokens(final_graph.optimize())
        self.fst = final_graph.optimize()
import logging
import os

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.he.taggers.date import DateFst
from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst
from nemo_text_processing.inverse_text_normalization.he.taggers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.he.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.he.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, generator_main


class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
        input_case: accepted for interface compatibility with other languages; unused here
    """

    def __init__(
        self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None, input_case: str = None
    ):

        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            # fix: plain literal instead of a placeholder-less f-string
            far_file = os.path.join(cache_dir, "he_itn.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # restore the compiled grammar from cache instead of rebuilding
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            # fix: lazy %-style logging args instead of eager f-string interpolation
            logging.info("ClassifyFst.fst was restored from %s.", far_file)
        else:
            logging.info("Creating ClassifyFst grammars.")

            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal)

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            date_graph = DateFst(ordinal=ordinal, cardinal=cardinal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            whitelist_graph = WhiteListFst(input_file=whitelist).fst
            punct_graph = PunctuationFst().fst

            # union of all taggers; lower weight wins, word is the fallback
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
                # NOTE: we convert ordinals in Hebrew only if it is a part of a date! this is why it is missing.
            )

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info("ClassifyFst grammars are saved to %s.", far_file)
+ +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, + string_map_cased, +) +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import insert_space, convert_space + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens + e.g. misses -> tokens { name: "mrs." } + This class has highest priority among all classifier grammars. + Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified). + + Args: + input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n), + e.g. nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv + """ + + def __init__(self, input_file: str = None): + super().__init__(name="whitelist", kind="classify") + prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) + + if input_file is None: + input_file = get_abs_path("data/whitelist.tsv") + + if not os.path.exists(input_file): + raise ValueError(f"Whitelist file {input_file} not found") + + optional_prefix_graph = pynini.closure( + pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + insert_space, 0, 1 + ) + whitelist = string_map_cased(input_file) + graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"") + final_graph = optional_prefix_graph + graph + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py new file mode 100644 index 000000000..142036ace --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/utils.py b/nemo_text_processing/inverse_text_normalization/he/utils.py new file mode 100644 index 000000000..90a53e521 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/utils.py @@ -0,0 +1,178 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +#################### +# HEBREW CONSTANTS # +#################### +units_feminine_dict = { + "0": "אפס", + "1": "אחת", + "2": "שתיים", + "3": "שלוש", + "4": "ארבע", + "5": "חמש", + "6": "שש", + "7": "שבע", + "8": "שמונה", + "9": "תשע", +} + +units_masculine_dict = { + "0": "אפס", + "1": "אחד", + "2": "שניים", + "3": "שלושה", + "4": "ארבעה", + "5": "חמישה", + "6": "שישה", + "7": "שבעה", + "8": "שמונה", + "9": "תשעה", +} + +tens_dict = { + "2": "עשרים", + "3": "שלושים", + "4": "ארבעים", + "5": "חמישים", + "6": "שישים", + "7": "שבעים", + "8": "שמונים", + "9": "תשעים", +} + +ten = {"short": "עשר", "long": "עשרה"} # double pronunciation: short is 'eser' and 'asar', long is 'esre' and 'asara' + +############# +# FUNCTIONS # +############# +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + +def augment_labels_with_punct_at_end(labels): + """ + augments labels: if key ends on a punctuation that value does not have, add a new label + where the value maintains the punctuation + + Args: + labels : input labels + Returns: + additional labels + """ + res = [] + for label in labels: + if len(label) > 1: + if label[0][-1] == "." 
and label[1][-1] != ".": + res.append([label[0], label[1] + "."] + label[2:]) + return res + + +def digit_by_digit(num): + + dbd = [' '.join([units_feminine_dict[digit] for digit in num])] + + # generate "1" as masculine and as feminine if exists + if units_feminine_dict["1"] in dbd[0]: + dbd.append(dbd[0].replace(units_feminine_dict["1"], units_masculine_dict["1"])) + + return dbd + + +def integer_to_text(num, only_fem=False): + if isinstance(num, int): + num = str(num) + # number is zero + if num == len(num) * "0": + return ['אפס'] + else: + # remove leading zeros from number + num = num.lstrip("0") + + # units + if len(num) == 1: + return _less_than_10(num, only_fem) + + # tenths + elif len(num) == 2: + return _less_than_100(num, only_fem) + + else: + raise Exception + + +def _less_than_10(num, only_fem=False): + """ + Returns a list of all the possible names of a number in range 0-9 + """ + + if only_fem: + return [units_feminine_dict[num]] + else: + return [units_feminine_dict[num], units_masculine_dict[num]] + + +def _less_than_100(num, only_fem=False): + """ + Returns a list of all the possible names of a number in range 0-99 + """ + + # init result + res = list() + + # split number to digits + tens, units = num + + # number is in range 0-9 + if len(num) == 1: + res.extend(_less_than_10(num)) + + # number is in range 10-99 + elif len(num) == 2: + + if num == "10": + if only_fem: + res.extend([ten["short"]]) + else: + res.extend([ten["long"], ten["short"]]) + + # number is in range 11-19 + elif tens == "1": + res.append(f'{units_feminine_dict[num[1]]} {ten["long"]}') + if not only_fem: + res.append(f'{units_masculine_dict[num[1]]} {ten["short"]}') + + else: + + # number is in range 20-99, a multiplication of 10 + if units == "0": + res.append(tens_dict[num[0]]) + + # number is in range 20-99, but not multiplication of 10 + else: + res.append(f'{tens_dict[num[0]]} {"ו"}{units_feminine_dict[num[1]]}') + if not only_fem: + res.append(f'{tens_dict[num[0]]} 
{"ו"}{units_masculine_dict[num[1]]}') + + return res diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py new file mode 100644 index 000000000..36b84a2fd --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal in Hebrew + e.g. cardinal { prefix: "וב" integer: "3405"} -> וב-3,405 + e.g. cardinal { negative: "-" integer: "904" } -> -904 + e.g. cardinal { prefix: "כ" integer: "123" } -> כ-123 + + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + + # Need parser to group digits by threes + exactly_three_digits = NEMO_DIGIT ** 3 + at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) + + # Thousands separator + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.insert("-") + + pynutil.delete("\"") + + delete_space, + 0, + 1, + ) + + # Removes the negative attribute and leaves the sign if occurs + optional_sign = pynini.closure( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("-") + + pynutil.delete("\"") + + delete_space, + 0, + 1, + ) + + # removes integer aspect + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit + + pynutil.delete("\"") + ) + + # Add thousands separator + graph = graph @ group_by_threes + + self.numbers = graph + + # add prefix and sign + graph = optional_prefix + optional_sign + graph + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py 
b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py new file mode 100644 index 000000000..9b69d4e28 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py @@ -0,0 +1,121 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_QUOTE, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, + e.g. 
{ day_prefix: "ה" day: "1" month_prefix: "ב" month: "6" year: "2012" } -> ה-1.6.2012 + """ + + def __init__(self): + super().__init__(name="date", kind="verbalize") + + day_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert('-') + + pynutil.delete("\"") + ) + + day = ( + pynutil.delete("day:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1, 2) + + pynutil.insert('.') + + pynutil.delete("\"") + + delete_space + ) + + month_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + delete_space + ) + + month = ( + pynutil.delete("month:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + year_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 3) + + pynutil.delete("\"") + + delete_space + ) + + year = ( + pynutil.delete("year:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + ####################### + # DATE FORMATS GRAPHS # + ####################### + + # day and month only + graph_dm = ( + pynini.closure(day_prefix + delete_zero_or_one_space, 0, 1) + + day + + pynini.closure(delete_zero_or_one_space, 0, 1) + + month + + delete_zero_or_one_space + ) + + # day month and year + graph_dmy = ( + graph_dm + delete_space + pynutil.insert('.') + pynini.closure(delete_zero_or_one_space + year, 0, 1) + ) + + # only month and year + graph_my = ( + pynini.closure(month_prefix + delete_zero_or_one_space, 0, 1) + + month + + pynutil.insert(' ') + + pynini.closure(delete_zero_or_one_space + year, 0, 1) + ) + + # only year + graph_y_only = year_prefix + insert_space + year + + final_graph = (graph_dm | graph_dmy | graph_my | 
graph_y_only) + delete_space + + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py new file mode 100644 index 000000000..c7eab357f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing decimal, + e.g. decimal { integer_part: "0" fractional_part: "33" } -> 0.33 + e.g. decimal { negative: "true" integer_part: "400" fractional_part: "323" } -> -400.323 + e.g. 
decimal { integer_part: "4" fractional_part: "5" quantity: "מיליון" } -> 4.5 מיליון + + """ + + def __init__(self): + super().__init__(name="decimal", kind="verbalize") + optionl_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1) + + # Need parser to group digits by threes + exactly_three_digits = NEMO_DIGIT ** 3 + at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) + + # Thousands separator + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + + integer = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + integer = integer @ group_by_threes + + optional_integer = pynini.closure(integer + delete_space, 0, 1) + + fractional = ( + pynutil.insert(".") + + pynutil.delete("fractional_part:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_fractional = pynini.closure(fractional + delete_space, 0, 1) + + quantity = ( + pynutil.delete("quantity:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1) + + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.insert("-") + + pynutil.delete("\"") + + delete_space, + 0, + 1, + ) + + graph = optional_prefix + optional_integer + optional_fractional + optional_quantity + self.numbers = graph + graph = optionl_sign + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py new file mode 
100644 index 000000000..b76d35324 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_NOT_QUOTE, delete_space, NEMO_SPACE, NEMO_SIGMA + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing measure, in Hebrew. + Some measures are concatenated to the numbers and other are don't (two measure lists) + e.g. measure { cardinal { integer: "3" } spaced_units: "מ״ג" } -> 3 מ״ג + e.g. measure { cardinal { integer: "1000" } units: "%" } -> 1,000% + e.g. measure { units: "%" cardinal { integer: "1" } } -> 1% + e.g. measure { spaced_units: "ס״מ" cardinal { integer: "1" } } -> 1 ס״מ + e.g. 
measure { prefix: "ל" cardinal { integer: "4" } spaced_units: "ס״מ" } -> ל-4 ס״מ + + Args: + decimal: DecimalFst + cardinal: CardinalFst + """ + + def __init__(self, decimal: GraphFst, cardinal: GraphFst): + super().__init__(name="measure", kind="verbalize") + + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert('-') + + pynutil.delete("\"") + + delete_space, + 0, + 1, + ) + + # Removes the negative attribute and leaves the sign if occurs + optional_sign = pynini.closure( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("-") + + pynutil.delete("\"") + + delete_space, + 0, + 1, + ) + + graph_decimal = ( + pynutil.delete("decimal {") + delete_space + decimal.numbers + delete_space + pynutil.delete("}") + ) + + graph_cardinal = ( + pynutil.delete("cardinal {") + delete_space + cardinal.numbers + delete_space + pynutil.delete("}") + ) + + unit = ( + pynutil.delete("units:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1) + + pynutil.delete("\"") + + delete_space + ) + unit @= pynini.cdrewrite(pynini.cross("\[SPACE\]", NEMO_SPACE), "", "", NEMO_SIGMA) # For space separated measures. 
+ + numbers_units = delete_space + unit + numbers_graph = (graph_cardinal | graph_decimal) + numbers_units + + one_graph = delete_space + pynutil.insert("1") + unit + pynutil.delete("cardinal { integer: \"1\" }") + + graph = optional_prefix + optional_sign + (numbers_graph | one_graph) + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py new file mode 100644 index 000000000..2d7bd0832 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for verbalizing ordinal in Hebrew + e.g. 
ordinal { integer: "10" } -> 10 + """ + + def __init__(self): + super().__init__(name="ordinal", kind="verbalize") + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete("\"") + ) + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py new file mode 100644 index 000000000..6506d0e87 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + NEMO_NOT_QUOTE, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class TimeFst(GraphFst): + """ + Finite state transducer for verbalizing time in Hebrew + e.g. time { hours: "2" minutes: "55" suffix: "בלילה" } -> 2:55 בלילה + e.g. time { hours: "2" minutes: "57" suffix: "בבוקר" } -> 2:57 בבוקר + e.g. time { prefix: "ב" hours: "6" minutes: "32" suffix: "בערב" } -> ב-18:32 בערב + e.g. 
time { prefix: "בשעה" hours: "2" minutes: "10" suffix: "בצהריים" } -> בשעה-14:10 בצהריים + + """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + + hour_to_noon = pynini.string_map( + [("12", "12"), ("1", "13"), ("2", "14"), ("3", "15"), ("4", "16"), ("5", "17"), ("6", "18"),] + ) + + hour_to_evening = pynini.string_map( + [("5", "17"), ("6", "18"), ("7", "19"), ("8", "20"), ("9", "21"), ("10", "22"), ("11", "23"),] + ) + + hour_to_night = pynini.string_map( + [ + ("8", "20"), + ("9", "21"), + ("10", "22"), + ("11", "23"), + ("12", "0"), + ("1", "1"), + ("2", "2"), + ("3", "3"), + ("4", "4"), + ] + ) + + day_suffixes = ( + insert_space + + pynutil.delete("suffix: \"") + + (pynini.accep("בבוקר") | pynini.accep("לפנות בוקר")) + + pynutil.delete("\"") + ) + + noon_suffixes = ( + insert_space + + pynutil.delete("suffix: \"") + + (pynini.accep("בצהריים") | pynini.accep("אחרי הצהריים") | pynini.accep("אחר הצהריים")) + + pynutil.delete("\"") + ) + + evening_suffixes = ( + insert_space + + pynutil.delete("suffix: \"") + + (pynini.accep("בערב") | pynini.accep("לפנות ערב")) + + pynutil.delete("\"") + ) + + night_suffixes = insert_space + pynutil.delete("suffix: \"") + pynini.accep("בלילה") + pynutil.delete("\"") + + hour = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete("\"") + ) + + minute = ( + pynutil.delete("minutes:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete("\"") + ) + + prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert("-") + + pynutil.delete("\"") + ) + + optional_prefix = pynini.closure(prefix + delete_zero_or_one_space, 0, 1) + optional_suffix = pynini.closure(delete_space + day_suffixes, 0, 1) + graph = hour + delete_space + pynutil.insert(":") + minute + optional_suffix + + for hour_to, suffix in zip( + 
[hour_to_noon, hour_to_evening, hour_to_night], [noon_suffixes, evening_suffixes, night_suffixes] + ): + graph |= hour @ hour_to + delete_space + pynutil.insert(":") + minute + delete_space + suffix + + graph |= optional_prefix + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py new file mode 100644 index 000000000..5a7ca62ad --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py @@ -0,0 +1,54 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.whitelist import WhiteListFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars in Hebrew. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal_graph = OrdinalFst().fst + + decimal = DecimalFst() + decimal_graph = decimal.fst + + measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst + + time_graph = TimeFst().fst + + date_graph = DateFst().fst + + whitelist_graph = WhiteListFst().fst + + graph = ( + time_graph | date_graph | measure_graph | ordinal_graph | decimal_graph | cardinal_graph | whitelist_graph + ) + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py new file mode 100644 index 000000000..db68d4318 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py @@ -0,0 +1,44 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence in Hebrew + """ + + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py new file mode 100644 index 000000000..a6d2463d0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for verbalizing whitelist + e.g. tokens { name: "mrs." } -> mrs. + """ + + def __init__(self): + super().__init__(name="whitelist", kind="verbalize") + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.delete("\"") + + delete_space, + 0, + 1, + ) + graph = ( + pynutil.delete("name:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete("\"") + ) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + final_graph = optional_prefix + graph + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py new file mode 100644 index 000000000..d531f44f2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py @@ -0,0 +1,34 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing plain tokens + e.g. tokens { name: "sleep" } -> sleep + """ + + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index c10819908..da85318b1 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -131,6 +131,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'he': # Hebrew + from nemo_text_processing.inverse_text_normalization.he.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize_final import ( + 
VerbalizeFinalFst, + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -175,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'he', 'hi', 'hy', 'mr', 'ja'], default="en", type=str, ) diff --git a/tests/nemo_text_processing/he/__init__.py b/tests/nemo_text_processing/he/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/tests/nemo_text_processing/he/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..cfb6f8db0 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,138 @@ +אפס~אפס +מינוס שלוש~-3 +עשר~עשר +שלוש עשרה~13 +שלושה עשר~13 +עשרים~20 +עשרים ותשע~29 +עשרים ותשעה~29 +ארבעים~40 +מינוס ארבעים ושש~-46 +שבעים ושבעה~77 +מאה~100 +מאה ואחת~101 +מאה ועשר~110 +מאה ושש עשרה~116 +מאה עשרים~120 +מאה ועשרים~120 +כמאה עשרים וחמש~כ-125 +מאתיים~200 +מאתיים ושלוש~203 +מאתיים שלושים~230 +שלוש מאות ושלושים~330 +מינוס מאתיים שישים ושבע~-267 +ארבע מאות~400 +כחמש מאות עובדים~כ-500 עובדים +חמש מאות שבעים ותשע~579 +תשע מאות תשעים~990 +תשע מאות תשעים ותשע~999 +אלף~1,000 +אלף וארבע~1,004 +אלף עשרים ושמונה~1,028 +אלף מאה וחמש~1,105 +אלף מאה שלושים~1,130 +אלף תשע מאות תשעים ואחת~1,991 +אלפיים~2,000 +אלפיים וחמש~2,005 +אלפיים ועשר~2,010 +אלפיים ואחת עשרה~2,011 +אלפיים מאה~2,100 +אלפיים מאתיים~2,200 +מינוס אלפיים מאתיים עשרים ושתיים~-2,222 +אלפיים שלוש מאות~2,300 +אלפיים ארבע מאות ושבע~2,407 +מינוס אלפיים ארבע מאות שבעים~-2,470 +מינוס אלפיים ארבע מאות שבעים וחמש~-2,475 +שלושת אלפים~3,000 +שלושת אלפים וחמש~3,005 +שלושת אלפים ועשר~3,010 +שלושת אלפים וארבע עשרה~3,014 +שלושת אלפים מאה~3,100 +שלושת אלפים מאתיים~3,200 +מינוס שלושת אלפים שבע מאות עשרים ואחת~-3,721 +שלושת אלפים שמונה מאות~3,800 +שלושת אלפים ושמונה מאות~3,800 +שלושת אלפים תשע מאות ושבע~3,907 +מינוס שלושת אלפים מאתיים ועשרים~-3,220 +חמשת אלפים~5,000 +תשעת אלפים תשע מאות תשעים ותשע~9,999 +עשרת אלפים~10,000 +עשרת אלפים ואחת~10,001 +עשרת אלפים וחמש עשרה~10,015 +עשרת אלפים ועשרים~10,020 +עשרת אלפים עשרים ושלוש~10,023 +עשרת אלפים מאתיים~10,200 +עשרת אלפים מאתיים ואחד~10,201 +עשרת אלפים מאתיים ארבעים~10,240 +עשרת אלפים מאתיים וארבעים~10,240 +עשרת אלפים שלוש מאות חמישים~10,350 +עשרת אלפים שלוש מאות וחמישים~10,350 +שתיים עשרה אלף שש מאות~12,600 +שתיים 
עשרה אלף ושש מאות~12,600 +שתיים עשרה אלף שש מאות ואחת~12,601 +כשמונים ושבע אלף ועשר~כ-87,010 +תשעים ותשע אלף תשע מאות תשעים ותשע~99,999 +מאה אלף~100,000 +כמאה אלף תושבים~כ-100,000 תושבים +מאה אלף ושלוש~100,003 +מאה אלף ושתיים עשרה~100,012 +מאה אלף וארבעים~100,040 +מאה אלף ארבעים ושבע~100,047 +מאה אלף וארבעים ושבע~100,047 +מאה אלף ומאה~100,100 +מאה אלף מאה~100,100 +מאה אלף מאה שלושים ושלוש~100,133 +מאה ואחד אלף~101,000 +מאה ואחד אלף ואחת~101,001 +מאה ואחד אלף ועשר~101,010 +מאה ואחד אלף ואחת עשרה~101,011 +מאה ואחד אלף מאתיים~101,200 +כמאה ואחד אלף ומאתיים~כ-101,200 +מינוס מאה ואחת אלף מאתיים ועשרים~-101,220 +מינוס מאה ואחת אלף מאתיים עשרים~-101,220 +מינוס מאה ואחת אלף מאתיים עשרים ותשע~-101,229 +מינוס מאה ואחת אלף מאתיים עשרים ותשע~-101,229 +מאה ושתיים אלף~102,000 +מאה ושלוש אלף חמש מאות~103,500 +מאה ושלוש אלף וחמש מאות~103,500 +מאה וארבע אלף חמש מאות וארבע~104,504 +מאתיים ארבעים אלף~240,000 +מאתיים וארבעים אלף~240,000 +מאתיים חמישים וחמש אלף ושש~255,006 +מאתיים חמישים וחמש אלף וארבע מאות ושש~255,406 +מאתיים חמישים וחמש אלף ארבע מאות ושש~255,406 +חמש מאות חמישים וחמש אלף~555,000 +תשע מאות תשעים ותשע אלף תשע מאות תשעים ותשע~999,999 +מיליון~1,000,000 +מיליון ואחת~1,000,001 +מיליון ועשר~1,000,010 +מיליון חמש עשרה~1,000,015 +מיליון ושבעים~1,000,070 +מיליון שבעים~1,000,070 +מיליון ארבע מאות~1,000,400 +מיליון וארבע מאות~1,000,400 +מיליון ארבע מאות עשרים~1,000,420 +מיליון ארבע מאות ועשרים~1,000,420 +מיליון אלף~1,001,000 +מיליון שלושת אלפים~1,003,000 +מיליון ואלף~1,001,000 +מיליון אלף ואחת~1,001,001 +שלושה מיליון אלף~3,001,000 +שלושה מיליוןאלף וחמש~3,001,005 +שלושה מיליון ארבעים ושלוש אלף~3,043,000 +שלושה מיליון ארבעים ושלוש אלף ואחת~3,043,001 +שלושה מיליון ארבעים ושלוש אלף ושישים ואחת~3,043,061 +שלושה מיליון ארבעים ושלוש אלף שישים ואחת~3,043,061 +שלושה מיליון חמש מאות ארבעים ושלוש אלף~3,543,000 +שלושה מיליון חמש מאות ארבעים ושלוש אלף ושבע~3,543,007 +מינוס שלושה מיליון חמש מאות ארבעים ושלוש אלף ושבע~-3,543,007 +עשר מיליון~10 מיליון +עשרה מיליון~10 מיליון +עשרים מיליון~20 
מיליון +חמש עשרה מיליון~15 מיליון +שלוש עשרה מיליון ארבעים ושלוש אלף~13,043,000 +מאה מיליון~100 מיליון +מאה עשרים ושתיים מיליון~122 מיליון +מאה עשרים ושתיים מיליון ושלוש עשרה~122,000,013 +מאה עשרים ושתיים מיליון חמישים אלף ושלוש עשרה~122,050,013 +שלוש אלף~3,000 diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..96b745de4 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,29 @@ +אחד במאי אלף תשע מאות שמונים ושלוש~1.5.1983 +השתיים עשרה לשתיים עשרה אלף תשע מאות תשעים ואחת~ה-12.12.1991 +השתיים עשרה בדצמבר אלף תשע מאות תשעים ואחת~ה-12.12.1991 +בינואר עשרים עשרים ואחת~בינואר 2021 +בשלישי לשלישי אלף תשע מאות תשעים~ב-3.3.1990 +העשירי באפריל~ה-10.4 +אחד במאי~1.5 +הראשון לחמישי~הראשון לחמישי +יוני אלפיים וחמש עשרה~יוני 2015 +ביוני אלפיים וחמש עשרה~ביוני 2015 +מתחיל בספטמבר עשרים עשרים~מתחיל בספטמבר 2020 +בשבעה עשר באוגוסט~ב-17.8 +בשבעה עשר באוגוסט עשרים שלושים~ב-17.8.2030 +בשבעה עשר לשמיני עשרים שלושים~ב-17.8.2030 +עשרים ושישי לרביעי עשרים עשרים וארבע~26.4.2024 +עשרים ושש לרביעי עשרים עשרים וארבע~26.4.2024 +עשרים ושישי לאפריל עשרים עשרים וארבע~26.4.2024 +עשרים ושש באפריל עשרים עשרים וארבע~26.4.2024 +עשרים ושישי לרביעי עשרים וארבע~26.4.24 +עשרים ושש לרביעי עשרים וארבע~26.4.24 +עשרים ושישי לאפריל עשרים וארבע~26.4.24 +עשרים ושש באפריל עשרים וארבע~26.4.24 +עשרים ושישה באפריל עשרים וארבע~26.4.24 +אנשים לא ידעו אחד מהשני~אנשים לא ידעו אחד מהשני +בשבעה באוקטובר~ב-7.10 +בשנת אלפיים וחמש~בשנת 2005 +משנת עשרים עשרים ואחת~משנת 2021 +השנה אלפיים ושלוש~השנה 2003 +שנת אלפיים וארבע~שנת 2004 \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..968f2ef81 --- /dev/null +++ 
b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,64 @@ +חמש נקודה שתיים מיליון~5.2 מיליון +מאה שישים וארבע נקודה חמישים ושמונה אלף~164.58 אלף +ארבע מאות מיליון~400 מיליון +חמישים מיליארד~50 מיליארד +ארבע מאות וחמש מיליארד~405 מיליארד +ארבע נקודה שמונים וחמש מיליארד~4.85 מיליארד +מאה מיליארד~100 מיליארד +מאה ועשר מיליארד~110 מיליארד +מאה שלושים ושתיים מיליארד~132 מיליארד +אחד נקודה שמונים וארבע מיליארד~1.84 מיליארד +אחד נקודה שמונים ואחת מיליארד~1.81 מיליארד +אחד נקודה חמש תשע מיליארד~1.59 מיליארד +אחד נקודה ארבע חמש שלוש מיליארד~1.453 מיליארד +אחד נקודה שבעים ושתיים מיליארד~1.72 מיליארד +אחד נקודה שתיים חמש מיליארד~1.25 מיליארד +שלוש עשרה מיליארד~13 מיליארד +שלושים מיליארד~30 מיליארד +אלפיים שמונה מאות וחמש נקודה שמונה שבע שלוש מיליון~2,805.873 מיליון +עשרה מיליון~10 מיליון +עשר מיליון~10 מיליון +חמש מיליון~5 מיליון +חמש מאות מיליון~500 מיליון +שתיים עשרה מיליון~12 מיליון +שניים עשר מיליון~12 מיליון +שלוש עשרה מיליון~13 מיליון +ארבע מיליון~4 מיליון +ארבעים וחמש מיליון~45 מיליון +חמש עשרה מיליארד~15 מיליארד +שני מיליון~2 מיליון +שתי מיליון~2 מיליון +שמונה מיליון~8 מיליון +מינוס שישים נקודה שתיים ארבע אפס אפס~-60.2400 +אפס נקודה עשרים ושש~0.26 +אפס נקודה שתיים שש~0.26 +שישים נקודה שתיים~60.2 +שמונה עשרה נקודה שמונים וחמש~18.85 +שמונה עשרה נקודה חמש אפס~18.50 +שמונה עשרה נקודה חמישים ושש~18.56 +שמונה עשרה נקודה תשע~18.9 +שמונה עשרה נקודה אפס חמש~18.05 +שמונה עשרה נקודה שתיים עשרה~18.12 +שמונה עשרה נקודה אפס אחד~18.01 +שמונה עשרה נקודה אפס אפס אפס~18.000 +שמונה עשרה נקודה שש~18.6 +שמונה עשרה נקודה שלוש אפס אפס~18.300 +שמונה עשרה נקודה שלושים ושש~18.36 +שמונה עשרה נקודה שתיים חמש~18.25 +שמונה עשרה נקודה עשרים ושתיים~18.22 +שמונה מאות ושמונה עשרה נקודה שלוש אפס שלוש~818.303 +שמונה מאות ושמונה נקודה שמונה~808.8 +שמונה מאות ושמונה נקודה אפס~808.0 +שמונה מאות שמונים ושמונה נקודה אחד~888.1 +שמונה מאות שמונים וארבע נקודה שלוש~884.3 +שמונה מאות שמונים ושתיים נקודה שמונה~882.8 +שמונה מאות שמונים ושתיים נקודה אפס~882.0 +שמונה מאות 
ארבעים וחמש נקודה תשעים וארבע~845.94 +שבעים ותשע וחצי~79.5 +שתיים ורבע~שתיים ורבע +שלוש ועשירית~3.1 +מינוס שלוש וחצי~-3.5 +עשר ושתי עשיריות~10.2 +שתיים ושלושת רבעי~2.75 +שתיים עשרה אלף ושתיים עשרה נקודה שתיים עשרה~12,012.12 +שתים עשרה אלף ושתים עשרה נקודה שתים עשרה~12,012.12 diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..43995e498 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt @@ -0,0 +1,9 @@ +מינוס חמש עשרה אחוז~-15% +חמש עשרה אחוז~15% +שתיים עשרה נקודה חמש מעלות~12.5° +שתיים עשרה נקודה חמש מעלות צלסיוס~12.5°C +אלף אחוזים~1,000% +אחוז אחד~1% +מאתיים חמישים גרם~250 ג׳ +סנטימטר אחד~1 ס״מ +שלוש מיליגרם~3 מ״ג diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..0f6464445 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,34 @@ +בשעה חמש בצהריים~בשעה 17:00 בצהריים +בחמש בצהריים~ב-17:00 בצהריים +רבע לשש בבוקר~5:45 בבוקר +בתשע בבוקר~ב-9:00 בבוקר +השעה עשרים וחמישה לאחת בצהריים~השעה 12:35 בצהריים +נפגשנו באחת ושתי דקות בצהריים~נפגשנו ב-13:02 בצהריים +נפגשנו באחת ושלוש דקות בצהריים~נפגשנו ב-13:03 בצהריים +נפגשנו באחת וחמישה בצהריים~נפגשנו ב-13:05 בצהריים +שתיים ועשרה בבוקר~2:10 בבוקר +בשעה שתיים ועשרה בצהריים~בשעה 14:10 בצהריים +בשתיים ועשרה אחרי הצהריים~ב-14:10 אחרי הצהריים +שלוש ודקה בצהריים~15:01 בצהריים +ארבע ושלוש דקות אחרי הצהריים~16:03 אחרי הצהריים +שש ועשרים דקות בערב~18:20 בערב +בשש וחצי בערב~ב-18:30 בערב +חמישה לשלוש בבוקר~2:55 בבוקר +רבע לשש בערב~17:45 בערב +שלוש בצהריים~15:00 בצהריים +אחת לפנות בוקר~1:00 לפנות בוקר +אתמול בחמש אחרי הצהריים יצאנו עם אמא למכולת ובדרך ראינו שהגן שלנו סגור~אתמול ב-17:00 אחרי 
הצהריים יצאנו עם אמא למכולת ובדרך ראינו שהגן שלנו סגור +חמישה לחצות~23:55 +ברבע לחצות~ב-23:45 +בשעה חצות ועשרה~בשעה 0:10 +בחצות ודקה~ב-0:01 +חצות ושתיים עשרה דקות~0:12 +שלוש דקות לחצות~23:57 +חצות ושתי דקות~0:02 +חצות~0:00 +דקה לשלוש בצהריים~14:59 בצהריים +הפגישה זזה משבע בבוקר לשמונה וחצי בבוקר~הפגישה זזה מ-7:00 בבוקר ל-8:30 בבוקר +באחת בלילה~ב-1:00 בלילה +חמש לפנות ערב~17:00 לפנות ערב +בשלוש לפנות בוקר~ב-3:00 לפנות בוקר +עשרים לחמש אחרי הצהריים~16:40 אחרי הצהריים \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..67e4d6560 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,4 @@ +בשנת שבעים לפני הספירה~בשנת 70 לפנה״ס +יש מאתיים חמישים עורכי דין חדשים~יש 250 עו״ד חדשים +ישראל היא המדינה החמישים ואחת של ארצות הברית~ישראל היא המדינה ה-51 של ארה״ב +דוקטור שמילוביץ רשם לי תרופה חדשה~ד״ר שמילוביץ רשם לי תרופה חדשה \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt new file mode 100644 index 000000000..1bf28b0fb --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt @@ -0,0 +1,56 @@ +אתמול בשעה שבע וחצי בבוקר היה לי תור לרופא~אתמול בשעה 7:30 בבוקר היה לי תור לרופא +הגעתי למרפאה בשבע ורבע בבוקר כדי לא לאחר~הגעתי למרפאה ב-7:15 בבוקר כדי לא לאחר +אמרתי לרופא שאני בן חמישים ושלוש שאני נשוי ויש לי שלושה ילדים.~אמרתי לרופא שאני בן 53 שאני נשוי ויש לי שלושה ילדים. 
+אמרתי לו שיש לי כאבים ביד, אז הוא בדק אותי ~אמרתי לו שיש לי כאבים ביד, אז הוא בדק אותי +הוא אמר שיש חשד לשבר באמה של בין שני סנטימטר לארבע סנטימטר ושנצטרך לעשות צילום כדי לדעת~הוא אמר שיש חשד לשבר באמה של בין 2 ס״מ ל-4 ס״מ ושנצטרך לעשות צילום כדי לדעת +בינתיים הוא רשם לי עשר מיליגרם של משככי כאבים~בינתיים הוא רשם לי 10 מ״ג של משככי כאבים +הוא אמר שזה מאוד נפוץ ושליותר מעשר אחוז מהאוכלוסיה יש את זה~הוא אמר שזה מאוד נפוץ ושליותר מ-10% מהאוכלוסיה יש את זה +הוא העריך את סיכויי ההחלמה בשמונים ושלוש נקודה שש אחוז~הוא העריך את סיכויי ההחלמה ב-83.6% +בסוף המפגש הוא קבע ביקורת לשמיני באוגוסט~בסוף המפגש הוא קבע ביקורת ל-8.8 +יש לי חמישה תפוחים~יש לי חמישה תפוחים +אף אחד לא רוצה~אף אחד לא רוצה +בכל כיתה יש עשרים, עשרים ושתיים תלמידים~בכל כיתה יש 20 , 22 תלמידים +בכל כיתה יש עשרים - עשרים ושתיים תלמידים~בכל כיתה יש 20 - 22 תלמידים +אחת עשרה אלף שבע מאות חמישים ושש~11,756 +ע"פ הנתונים החדשים שקיבלנו הייתה עלייה של שלושים נקודה שתיים עשרה אחוז במכירות~ע"פ הנתונים החדשים שקיבלנו הייתה עלייה של 30.12% במכירות +אני בטוח בזה במאה אחוז~אני בטוח בזה ב-100% +יש לזה שלושים ותשע נקודה שישים ושבע אחוז הצלחה~יש לזה 39.67% הצלחה +יהי אפסילון אפס ויהי איקס~יהי אפסילון אפס ויהי איקס +לשתינו יש שתי בנות~לשתינו יש שתי בנות +היום יום שני ומחר יום שלישי~היום יום שני ומחר יום שלישי +שלוש וחצי קילוגרם~3.5 ק״ג +חמש ורבע סנטימטר~5.25 ס״מ +שמונה ושלושת רבעי~8.75 +שמונה ורבע מיליון~8.25 מיליון +שתיים וחצי~שתיים וחצי +שתיים וחצי מיליון~2.5 מיליון +בשתיים וחצי מיליון~ב-2.5 מיליון +שתיים וחצי בבוקר~2:30 בבוקר +מינוס שלוש וחצי אחוז~-3.5% +שלוש וחצי~שלוש וחצי +עשרת אלפים ומאתיים ארבעים~10,240 +אפס מאופס~אפס מאופס +הוא מתנהג כמו אפס.... בקיצור כל עניין האפס~הוא מתנהג כמו אפס.... 
בקיצור כל עניין האפס +מאה שישים וארבע נקודה חמישים ושמונה אלף~164.58 אלף +הפגישה זזה משבע וחצי בבוקר לשמונה~הפגישה זזה מ-7:30 בבוקר לשמונה +על סמך זה יצאנו ביום ראשון~על סמך זה יצאנו ביום ראשון +צעירים היו בגיל שלושים שלושים וחמש~צעירים היו בגיל 30 35 +אולי שניים שלושה~אולי שניים שלושה +בן הראשון שלי שנולד~בן הראשון שלי שנולד +אנחנו היינו איזה חמישה עשר איש~אנחנו היינו איזה 15 איש +התחילו לחזור וחזרו אחד אחד~התחילו לחזור וחזרו אחד אחד +וזה היה כבר אולי שעה תשע~וזה היה כבר אולי שעה תשע +אני מדבר על שמונה עשר באפריל~אני מדבר על 18.4 +שמונה עשר בינואר~18.1 +הייתה נראית כעת שתיים עשרה שלוש עשרה~הייתה נראית כעת 12 13 +היה בערך בעשירי בעשירי למאי~היה בערך בעשירי ב-10.5 +באמצע הלילה שתיים בלילה~באמצע הלילה 2:00 בלילה +למחרת בשעה חמש~למחרת בשעה חמש +בשנת אלף תשע מאות ארבעים ושמונה~בשנת 1948 +באלף תשע מאות ארבעים ושמונה~ב-1,948 +ארבע מאות וחמישים מיליגרם~450 מ״ג +ארבע וחצי~ארבע וחצי +יהי אפסילון אפס ויהי איקס~יהי אפסילון אפס ויהי איקס +לשתינו יש שתי בנות~לשתינו יש שתי בנות +מחר בשש וחצי בבוקר נעלה על האוטובוסים ונצא לטיול השנתי. בשעה שמונה נגיע למצדה ונתחיל לטפס למעלה, נהיה שם עד אחת וחצי בצהריים, אולי רבע לשתיים, ונרד בשביל הנחש~מחר ב-6:30 בבוקר נעלה על האוטובוסים ונצא לטיול השנתי. בשעה שמונה נגיע למצדה ונתחיל לטפס למעלה, נהיה שם עד 13:30 בצהריים , אולי 1:45 , ונרד בשביל הנחש +יש לי חמישה תפוחים~יש לי חמישה תפוחים \ No newline at end of file diff --git a/tests/nemo_text_processing/he/test_cardinal.py b/tests/nemo_text_processing/he/test_cardinal.py new file mode 100644 index 000000000..4700725b1 --- /dev/null +++ b/tests/nemo_text_processing/he/test_cardinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_date.py b/tests/nemo_text_processing/he/test_date.py new file mode 100644 index 000000000..73c183e7b --- /dev/null +++ b/tests/nemo_text_processing/he/test_date.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_decimal.py b/tests/nemo_text_processing/he/test_decimal.py new file mode 100644 index 000000000..125fc31d0 --- /dev/null +++ b/tests/nemo_text_processing/he/test_decimal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_full_sentences.py b/tests/nemo_text_processing/he/test_full_sentences.py new file mode 100644 index 000000000..8eba0c7db --- /dev/null +++ b/tests/nemo_text_processing/he/test_full_sentences.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestFullSentences: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_full_sentences.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_measure.py b/tests/nemo_text_processing/he/test_measure.py new file mode 100644 index 000000000..1649effa7 --- /dev/null +++ b/tests/nemo_text_processing/he/test_measure.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMeasure: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..bce2e24b9 --- /dev/null +++ b/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,61 @@ +#! /bin/sh + +PROJECT_DIR=/workspace/tests + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNDate() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNDecimal() 
{ + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + + +testITNTime() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNMeasure() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_measure.txt + runtest $input +} + + +testITNWhitelist() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/he/test_time.py b/tests/nemo_text_processing/he/test_time.py new file mode 100644 index 000000000..f3bba67b5 --- /dev/null +++ b/tests/nemo_text_processing/he/test_time.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_whitelist.py b/tests/nemo_text_processing/he/test_whitelist.py new file mode 100644 index 000000000..fb14c2a58 --- /dev/null +++ b/tests/nemo_text_processing/he/test_whitelist.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..846973eee 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -101,6 +101,7 @@ def parse_args(): 'ar', 'it', 'es_en', + 'he', 'hi', 'hy', 'mr', @@ -283,6 +284,13 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.mr.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + elif args.language == 'he': + from nemo_text_processing.inverse_text_normalization.he.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'hy': from nemo_text_processing.inverse_text_normalization.hy.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, @@ -312,6 +320,8 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + else: + raise KeyError(f"Language {args.language} is not defined for export.") output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( 
output_dir=output_dir, From b0d7cb3800bf4215c4e0fe5de1f806e820b3400b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 00:06:09 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../he/graph_utils.py | 4 +-- .../he/taggers/cardinal.py | 26 +++++-------------- .../he/taggers/date.py | 16 ++++++------ .../he/taggers/decimal.py | 14 +++++----- .../he/taggers/time.py | 13 ++++++++-- .../he/taggers/whitelist.py | 7 ++--- .../inverse_text_normalization/he/utils.py | 5 ++-- .../he/verbalizers/cardinal.py | 2 +- .../he/verbalizers/decimal.py | 2 +- .../he/verbalizers/measure.py | 12 +++++++-- .../he/verbalizers/time.py | 20 ++++++++++++-- .../he/verbalizers/whitelist.py | 2 +- .../he/verbalizers/word.py | 2 +- 13 files changed, 72 insertions(+), 53 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py index 7d96154a9..adabc1445 100644 --- a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py @@ -41,7 +41,7 @@ def string_map_cased(input_file: str): def apply_fst(text, fst): - """ Given a string input, returns the output string + """Given a string input, returns the output string produced by traversing the path with lowest weight. If no valid path accepts input string, returns an error. 
@@ -118,4 +118,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py index 49d3a26ec..fee2b8895 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -22,13 +22,12 @@ delete_optional_and, ) from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path - from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, - insert_space, delete_space, + insert_space, ) from nemo_text_processing.text_normalization.en.utils import load_labels @@ -52,9 +51,7 @@ def __init__(self): graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) graph_ties += pynini.union(delete_space + delete_and + graph_digit, pynutil.insert("0", weight=0.001)) - graph_two_digit = pynini.union( - graph_teen, graph_ties - ) + graph_two_digit = pynini.union(graph_teen, graph_ties) self.graph_two_digit = graph_two_digit | graph_digit # hundreds @@ -72,9 +69,7 @@ def __init__(self): pynutil.insert("00", weight=0.001), ) graph_hundred = pynini.union( - graph_hundred, - pynutil.insert("0") + graph_two_digit, - pynutil.insert("00") + graph_digit + graph_hundred, pynutil.insert("0") + graph_two_digit, pynutil.insert("00") + graph_digit ) self.graph_hundred = graph_hundred @ ( @@ -87,16 +82,12 @@ def __init__(self): delete_thousand = pynutil.delete("אלפים") | pynutil.delete("אלף", weight=0.001) large_number_prefix = pynini.union( - graph_hundred, - pynutil.insert("0") + graph_two_digit, - pynutil.insert("00") + thousand_digit + graph_hundred, pynutil.insert("0") + 
graph_two_digit, pynutil.insert("00") + thousand_digit ) many_thousands = large_number_prefix + delete_space + delete_thousand graph_thousands = delete_optional_and + pynini.union( - (pynutil.insert("00") + thousand), - many_thousands, - pynutil.insert("000", weight=0.001) + (pynutil.insert("00") + thousand), many_thousands, pynutil.insert("000", weight=0.001) ) self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero) @@ -113,14 +104,11 @@ def __init__(self): graph_millions = pynini.union(many_millions, million, pynutil.insert("000", weight=0.001)) graph = pynini.union( - graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, - graph_zero + graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, graph_zero ) graph = graph @ pynini.union( - pynutil.delete(pynini.closure("0")) - + pynini.difference(NEMO_DIGIT, "0") - + pynini.closure(NEMO_DIGIT), "0" + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0" ) labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv")) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py index 3dae17f2b..2e337ed88 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py @@ -17,11 +17,7 @@ from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import ( - delete_extra_space, - delete_space, - insert_space, -) +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, insert_space def _get_year_graph(graph_two_digits, graph_thousands): @@ -66,7 +62,7 @@ def __init__(self, cardinal: GraphFst, 
ordinal: GraphFst): month_name2number_graph = pynutil.insert("month: \"") + month_name2number + pynutil.insert("\"") month_number2number = pynini.string_file(get_abs_path("data/months_ordinal2number.tsv")) - month_number2number_graph = (pynutil.insert("month: \"") + month_number2number + pynutil.insert("\"")) + month_number2number_graph = pynutil.insert("month: \"") + month_number2number + pynutil.insert("\"") all_month_graph = month_name2number_graph | month_number2number_graph @@ -77,8 +73,12 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): delete_prefix = pynutil.delete(prefix_graph) graph_prefix = pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") - year_prefix_graph = pynutil.insert("morphosyntactic_features: \"") + pynini.closure(prefix_graph, 0, 1) + pynini.union("שנה", "שנת") + pynutil.insert("\"") - + year_prefix_graph = ( + pynutil.insert("morphosyntactic_features: \"") + + pynini.closure(prefix_graph, 0, 1) + + pynini.union("שנה", "שנת") + + pynutil.insert("\"") + ) graph_dm = ( pynini.closure(graph_prefix + insert_space, 0, 1) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py index 4866ba1bc..fdd8ac836 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - MINUS, - GraphFst, - delete_and, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import MINUS, GraphFst, delete_and from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, @@ -36,7 +32,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL e.g. 
one million -> integer_part: "1" quantity: "million" e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -107,7 +103,11 @@ def __init__(self, cardinal: GraphFst): point = pynutil.delete("נקודה") graph_negative = pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space - optional_graph_negative = pynini.closure(graph_negative, 0, 1,) + optional_graph_negative = pynini.closure( + graph_negative, + 0, + 1, + ) graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py index bf9bf39a1..310a4427d 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py @@ -49,7 +49,9 @@ def __init__(self): suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv")) time_prefix = pynini.string_file(get_abs_path("data/prefix.tsv")) - time_prefix_graph = pynutil.insert("morphosyntactic_features: \"") + time_prefix + pynutil.insert("\"") + insert_space + time_prefix_graph = ( + pynutil.insert("morphosyntactic_features: \"") + time_prefix + pynutil.insert("\"") + insert_space + ) optional_time_prefix_graph = pynini.closure(time_prefix_graph, 0, 1) @@ -67,7 +69,14 @@ def __init__(self): ) graph_minute_to_verbose = pynini.string_map( - [("רבע", "45"), ("עשרה", "50"), ("חמישה", "55"), ("עשרים", "40"), ("עשרים וחמישה", "35"), ("דקה", "59"),] + [ + ("רבע", "45"), + ("עשרה", "50"), + ("חמישה", "55"), + ("עשרים", "40"), + ("עשרים וחמישה", "35"), + ("דקה", "59"), + ] ) # only used for < 1000 thousand -> 0 weight diff --git 
a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py index 4cce7ebee..b1cfa22bd 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py @@ -17,12 +17,9 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, - string_map_cased, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, string_map_cased from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import insert_space, convert_space +from nemo_text_processing.text_normalization.en.graph_utils import convert_space, insert_space class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/he/utils.py b/nemo_text_processing/inverse_text_normalization/he/utils.py index 90a53e521..30e9e3e2f 100644 --- a/nemo_text_processing/inverse_text_normalization/he/utils.py +++ b/nemo_text_processing/inverse_text_normalization/he/utils.py @@ -56,6 +56,7 @@ ten = {"short": "עשר", "long": "עשרה"} # double pronunciation: short is 'eser' and 'asar', long is 'esre' and 'asara' + ############# # FUNCTIONS # ############# @@ -65,7 +66,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -73,7 +74,7 @@ def get_abs_path(rel_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: diff --git 
a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py index 36b84a2fd..d4232b2f3 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -32,7 +32,7 @@ def __init__(self): super().__init__(name="cardinal", kind="verbalize") # Need parser to group digits by threes - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # Thousands separator diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py index c7eab357f..8fcd388b3 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py @@ -33,7 +33,7 @@ def __init__(self): optionl_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1) # Need parser to group digits by threes - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # Thousands separator diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py index b76d35324..6c080d910 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -16,7 +16,13 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_NOT_QUOTE, delete_space, NEMO_SPACE, NEMO_SIGMA +from 
nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_CHAR, + NEMO_NOT_QUOTE, + NEMO_SIGMA, + NEMO_SPACE, + delete_space, +) class MeasureFst(GraphFst): @@ -77,7 +83,9 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst): + pynutil.delete("\"") + delete_space ) - unit @= pynini.cdrewrite(pynini.cross("\[SPACE\]", NEMO_SPACE), "", "", NEMO_SIGMA) # For space separated measures. + unit @= pynini.cdrewrite( + pynini.cross("\[SPACE\]", NEMO_SPACE), "", "", NEMO_SIGMA + ) # For space separated measures. numbers_units = delete_space + unit numbers_graph = (graph_cardinal | graph_decimal) + numbers_units diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py index 6506d0e87..465c240dc 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py @@ -39,11 +39,27 @@ def __init__(self): super().__init__(name="time", kind="verbalize") hour_to_noon = pynini.string_map( - [("12", "12"), ("1", "13"), ("2", "14"), ("3", "15"), ("4", "16"), ("5", "17"), ("6", "18"),] + [ + ("12", "12"), + ("1", "13"), + ("2", "14"), + ("3", "15"), + ("4", "16"), + ("5", "17"), + ("6", "18"), + ] ) hour_to_evening = pynini.string_map( - [("5", "17"), ("6", "18"), ("7", "19"), ("8", "20"), ("9", "21"), ("10", "22"), ("11", "23"),] + [ + ("5", "17"), + ("6", "18"), + ("7", "19"), + ("8", "20"), + ("9", "21"), + ("10", "22"), + ("11", "23"), + ] ) hour_to_night = pynini.string_map( diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py index a6d2463d0..b41bf2951 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py @@ -45,6 +45,6 @@ def __init__(self): + 
pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) final_graph = optional_prefix + graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py index d531f44f2..a2ea163d2 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py @@ -29,6 +29,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() From bd7696fa36e75edbcfd1c128d3ccca008a97647d Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Thu, 25 Sep 2025 13:50:45 -0700 Subject: [PATCH 3/9] rebase Signed-off-by: tbartley94 --- .../he/taggers/cardinal.py | 59 +++++++-- .../he/taggers/date.py | 30 +++-- .../he/taggers/decimal.py | 91 ++++++++++---- .../he/taggers/measure.py | 57 ++++++--- .../he/taggers/ordinal.py | 10 +- .../he/taggers/punctuation.py | 8 +- .../he/taggers/time.py | 113 +++++++++++------- .../he/taggers/tokenize_and_classify.py | 66 +++++++--- .../he/taggers/whitelist.py | 11 +- .../he/taggers/word.py | 14 ++- .../he/verbalizers/cardinal.py | 27 +++-- 11 files changed, 351 insertions(+), 135 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py index fee2b8895..3dd822f48 100644 --- 
a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -90,18 +90,27 @@ def __init__(self): (pynutil.insert("00") + thousand), many_thousands, pynutil.insert("000", weight=0.001) ) - self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero) + self.graph_thousands = pynini.union( + graph_thousands + delete_space + graph_hundred, graph_zero + ) self.graph_thousands @= pynini.union( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0" + pynutil.delete(pynini.closure("0")) + + pynini.difference(NEMO_DIGIT, "0") + + pynini.closure(NEMO_DIGIT), + "0", ) # millions million = pynini.string_map([("מיליון", "001")]) - delete_millions = pynutil.delete("מיליונים") | pynutil.delete("מיליון", weight=0.001) + delete_millions = pynutil.delete("מיליונים") | pynutil.delete( + "מיליון", weight=0.001 + ) many_millions = large_number_prefix + delete_space + delete_millions - graph_millions = pynini.union(many_millions, million, pynutil.insert("000", weight=0.001)) + graph_millions = pynini.union( + many_millions, million, pynutil.insert("000", weight=0.001) + ) graph = pynini.union( graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, graph_zero @@ -112,7 +121,9 @@ def __init__(self): ) labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv")) - labels_exception = list(set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"])) + labels_exception = list( + set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"]) + ) labels_exception += ["ו" + label for label in labels_exception] graph_exception = pynini.union(*labels_exception).optimize() @@ -121,29 +132,51 @@ def __init__(self): self.graph_no_exception = graph ### Token insertion - minus_graph = pynutil.insert("negative: ") + pynini.cross("מינוס", "\"-\"") + NEMO_SPACE + minus_graph = ( + 
pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE + ) optional_minus_graph = pynini.closure(minus_graph, 0, 1) optional_prefix_graph = pynini.closure( - pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + insert_space, 0, 1 + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + + insert_space, + 0, + 1, ) - graph_wo_small_digits = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph + graph_wo_small_digits = ( + pynini.project(graph, "input") - graph_exception.arcsort() + ) @ graph - cardinal_wo_viable_hours = load_labels(get_abs_path("data/numbers/viable_hours.tsv")) + cardinal_wo_viable_hours = load_labels( + get_abs_path("data/numbers/viable_hours.tsv") + ) cardinal_wo_viable_hours = list(set([x[0] for x in cardinal_wo_viable_hours])) viable_hours_exception = pynini.union(*cardinal_wo_viable_hours).optimize() - self.graph_wo_viable_hours = (pynini.project(graph, "input") - viable_hours_exception.arcsort()) @ graph + self.graph_wo_viable_hours = ( + pynini.project(graph, "input") - viable_hours_exception.arcsort() + ) @ graph small_number_with_minus = ( - insert_space + minus_graph + pynutil.insert("integer: \"") + self.graph_no_exception + pynutil.insert("\"") + insert_space + + minus_graph + + pynutil.insert('integer: "') + + self.graph_no_exception + + pynutil.insert('"') ) big_number_with_optional_minus = ( - optional_minus_graph + pynutil.insert("integer: \"") + graph_wo_small_digits + pynutil.insert("\"") + optional_minus_graph + + pynutil.insert('integer: "') + + graph_wo_small_digits + + pynutil.insert('"') ) - graph = optional_prefix_graph + (small_number_with_minus | big_number_with_optional_minus) + graph = optional_prefix_graph + ( + small_number_with_minus | big_number_with_optional_minus + ) final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py 
b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py index 2e337ed88..2a3db1907 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py @@ -25,7 +25,8 @@ def _get_year_graph(graph_two_digits, graph_thousands): Transducer for year, e.g. twenty twenty -> 2020 """ year_graph = pynini.union( - (graph_two_digits + delete_space + graph_two_digits), graph_thousands # 20 19, 40 12, 20 20 + (graph_two_digits + delete_space + graph_two_digits), + graph_thousands, # 20 19, 40 12, 20 20 ) # 2012 - assuming no limit on the year year_graph.optimize() @@ -53,13 +54,19 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): two_digits_graph = cardinal.graph_two_digit day_graph = pynutil.add_weight(two_digits_graph | ordinal_graph, -0.7) - day_graph = pynutil.insert("day: \"") + day_graph + pynutil.insert("\"") + day_graph = pynutil.insert('day: "') + day_graph + pynutil.insert('"') month_names = pynini.string_file(get_abs_path("data/months.tsv")) - month_names_graph = pynutil.insert("month: \"") + month_names + pynutil.insert("\"") + month_names_graph = ( + pynutil.insert('month: "') + month_names + pynutil.insert('"') + ) - month_name2number = pynini.string_file(get_abs_path("data/months_name2number.tsv")) - month_name2number_graph = pynutil.insert("month: \"") + month_name2number + pynutil.insert("\"") + month_name2number = pynini.string_file( + get_abs_path("data/months_name2number.tsv") + ) + month_name2number_graph = ( + pynutil.insert('month: "') + month_name2number + pynutil.insert('"') + ) month_number2number = pynini.string_file(get_abs_path("data/months_ordinal2number.tsv")) month_number2number_graph = pynutil.insert("month: \"") + month_number2number + pynutil.insert("\"") @@ -67,7 +74,12 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): all_month_graph = month_name2number_graph | month_number2number_graph year_graph = 
_get_year_graph(two_digits_graph, cardinal.graph_thousands) - graph_year = delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\"") + graph_year = ( + delete_extra_space + + pynutil.insert('year: "') + + year_graph + + pynutil.insert('"') + ) prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) delete_prefix = pynutil.delete(prefix_graph) @@ -99,7 +111,11 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): + graph_year ) - graph_my = pynini.closure(graph_prefix + insert_space, 0, 1) + month_names_graph + graph_year + graph_my = ( + pynini.closure(graph_prefix + insert_space, 0, 1) + + month_names_graph + + graph_year + ) graph_y_only = year_prefix_graph + graph_year final_graph = graph_dm | graph_dmy | graph_my | graph_y_only diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py index fdd8ac836..ef9d5b625 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py @@ -26,7 +26,9 @@ ) -def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike': +def get_quantity( + decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike" +) -> "pynini.FstLike": """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral in Hebrew, e.g. 
one million -> integer_part: "1" quantity: "million" @@ -37,7 +39,9 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL cardinal_up_to_hundred: cardinal FST """ numbers = cardinal_up_to_hundred @ ( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) + pynutil.delete(pynini.closure("0")) + + pynini.difference(NEMO_DIGIT, "0") + + pynini.closure(NEMO_DIGIT) ) suffix_labels = ["אלף", "מיליון", "מיליארד"] @@ -45,15 +49,21 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL suffix = pynini.union(*suffix_labels).optimize() res = ( - pynutil.insert("integer_part: \"") + pynutil.insert('integer_part: "') + numbers - + pynutil.insert("\"") + + pynutil.insert('"') + delete_extra_space - + pynutil.insert("quantity: \"") + + pynutil.insert('quantity: "') + suffix - + pynutil.insert("\"") + + pynutil.insert('"') + ) + res |= ( + decimal + + delete_extra_space + + pynutil.insert('quantity: "') + + (suffix | "אלף") + + pynutil.insert('"') ) - res |= decimal + delete_extra_space + pynutil.insert("quantity: \"") + (suffix | "אלף") + pynutil.insert("\"") return res @@ -75,26 +85,42 @@ def __init__(self, cardinal: GraphFst): prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) optional_prefix_graph = pynini.closure( - pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + insert_space, 0, 1 + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + + insert_space, + 0, + 1, ) # all cardinals cardinal_graph = cardinal.graph_no_exception # all fractions - fractions = pynini.string_file(get_abs_path("data/numbers/decimal_fractions.tsv")) + fractions = pynini.string_file( + get_abs_path("data/numbers/decimal_fractions.tsv") + ) fractions_graph = delete_zero_or_one_space + delete_and + fractions - fractions_graph = pynutil.insert("fractional_part: \"") + fractions_graph + pynutil.insert("\"") + 
fractions_graph = ( + pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"') + ) # identify decimals that can be understood time, and don't convert them to avoid ambiguity viable_minutes_verbose = ["חצי", "רבע"] viable_minutes_exception = pynini.union(*viable_minutes_verbose).optimize() - fractions_wo_minutes = (pynini.project(fractions, "input") - viable_minutes_exception.arcsort()) @ fractions - fractions_wo_minutes = delete_zero_or_one_space + delete_and + fractions_wo_minutes - fractions_wo_minutes = pynutil.insert("fractional_part: \"") + fractions_wo_minutes + pynutil.insert("\"") + fractions_wo_minutes = ( + pynini.project(fractions, "input") - viable_minutes_exception.arcsort() + ) @ fractions + fractions_wo_minutes = ( + delete_zero_or_one_space + delete_and + fractions_wo_minutes + ) + fractions_wo_minutes = ( + pynutil.insert('fractional_part: "') + + fractions_wo_minutes + + pynutil.insert('"') + ) - graph_decimal = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_decimal |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_decimal = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_decimal |= cardinal.graph_two_digit graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal @@ -109,38 +135,55 @@ def __init__(self, cardinal: GraphFst): 1, ) - graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") - graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") + graph_integer = ( + pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + ) + graph_fractional = ( + pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"') + ) # integer could be an hour, but minutes cannot: convert to decimal - viable_hour_unviable_minutes = graph_integer + delete_extra_space + fractions_wo_minutes + viable_hour_unviable_minutes = ( + graph_integer + delete_extra_space + 
fractions_wo_minutes + ) # integer cannot be an hour, but minutes can: convert to decimal unviable_hour_viable_minutes = ( - pynutil.insert("integer_part: \"") + pynutil.insert('integer_part: "') + cardinal.graph_wo_viable_hours - + pynutil.insert("\"") + + pynutil.insert('"') + delete_extra_space + fractions_graph ) # minus sign followed by ambiguous decimal: convert to decimal, there is no negative time - negative_viable_time = graph_negative + graph_integer + delete_extra_space + fractions_graph + negative_viable_time = ( + graph_negative + graph_integer + delete_extra_space + fractions_graph + ) # all decimals with fractions, not excluding anything (used in other FSTs) all_decimals_wo_point = graph_integer + delete_extra_space + fractions_graph # only cases with fractional part that cannot be interpreted as time - graph_wo_point = viable_hour_unviable_minutes | unviable_hour_viable_minutes | negative_viable_time + graph_wo_point = ( + viable_hour_unviable_minutes + | unviable_hour_viable_minutes + | negative_viable_time + ) # all decimals with the word "point" graph_w_point = ( - pynini.closure(graph_integer + delete_extra_space, 0, 1) + point + delete_extra_space + graph_fractional + pynini.closure(graph_integer + delete_extra_space, 0, 1) + + point + + delete_extra_space + + graph_fractional ) final_graph_wo_sign = graph_w_point | graph_wo_point self.final_graph_wo_sign = graph_w_point | all_decimals_wo_point - final_graph = optional_prefix_graph + optional_graph_negative + final_graph_wo_sign + final_graph = ( + optional_prefix_graph + optional_graph_negative + final_graph_wo_sign + ) quantity_graph = get_quantity(self.final_graph_wo_sign, cardinal.graph_hundred) final_graph |= optional_prefix_graph + optional_graph_negative + quantity_graph diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py index 31c0a7286..20109f425 100644 --- 
a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py @@ -15,10 +15,18 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import ( + CardinalFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import ( + DecimalFst, +) +from nemo_text_processing.inverse_text_normalization.he.utils import ( + get_abs_path, +) from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_SPACE, delete_extra_space, @@ -48,12 +56,19 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): # optional negative sign optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("מינוס", "\"-\"") + NEMO_SPACE, 0, 1 + pynutil.insert("code_switch: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE, + 0, + 1, ) prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) optional_prefix_graph = pynini.closure( - pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + insert_space, 0, 1 + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + + insert_space, + 0, + 1, ) # cardinal numbers @@ -61,14 +76,17 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): # Let singular apply to values > 1 as they could be part of an adjective phrase (e.g. 
14 foot tall building) subgraph_decimal = ( - pynutil.insert("decimal { ") + decimal.final_graph_wo_sign + pynutil.insert(" }") + delete_extra_space + pynutil.insert("decimal { ") + + decimal.final_graph_wo_sign + + pynutil.insert(" }") + + delete_extra_space ) subgraph_cardinal = ( pynutil.insert("cardinal { ") - + pynutil.insert("integer: \"") + + pynutil.insert('integer: "') + cardinal_graph - + pynutil.insert("\"") + + pynutil.insert('"') + pynutil.insert(" }") + delete_extra_space ) @@ -76,11 +94,13 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): # convert units joined_units = pynini.string_file(get_abs_path("data/measurements.tsv")) joined_units = pynini.invert(joined_units) - joined_units = pynutil.insert("units: \"") + joined_units + pynutil.insert("\"") + joined_units = pynutil.insert('units: "') + joined_units + pynutil.insert('"') spaced_units = pynini.string_file(get_abs_path("data/spaced_measurements.tsv")) spaced_units = pynini.invert(spaced_units) - spaced_units = pynutil.insert("units: \"\[SPACE\]") + spaced_units + pynutil.insert("\"") + spaced_units = ( + pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"'). 
# noqa: W605 + ) # in joint units the unit is concatenated to the number, in spaced unit separate the unit with a space units_graph = joined_units | spaced_units @@ -91,15 +111,22 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): one_graph = ( insert_space + pynutil.insert("cardinal { ") - + pynutil.insert("integer: \"") + + pynutil.insert('integer: "') + one - + pynutil.insert("\"") + + pynutil.insert('"') + pynutil.insert(" }") ) number_graph = subgraph_decimal | subgraph_cardinal - number_unit_graph = (number_graph + units_graph) | (units_graph + delete_space + one_graph) + number_unit_graph = (number_graph + units_graph) | ( + units_graph + delete_space + one_graph + ) - final_graph = optional_prefix_graph + optional_graph_negative + number_unit_graph + delete_zero_or_one_space + final_graph = ( + optional_prefix_graph + + optional_graph_negative + + number_unit_graph + + delete_zero_or_one_space + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py index f1205de22..01d1ca7bc 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py @@ -15,8 +15,12 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, +) +from nemo_text_processing.inverse_text_normalization.he.utils import ( + get_abs_path, +) from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA @@ -38,6 +42,6 @@ def __init__(self, cardinal: GraphFst): self.graph = graph @ cardinal_graph - final_graph = pynutil.insert("integer: \"") + self.graph 
+ pynutil.insert("\"") + final_graph = pynutil.insert('integer: "') + self.graph + pynutil.insert('"') final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py index 8dae89220..b7f344f13 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py @@ -15,7 +15,9 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, +) class PunctuationFst(GraphFst): @@ -27,9 +29,9 @@ class PunctuationFst(GraphFst): def __init__(self): super().__init__(name="punctuation", kind="classify") - s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~" + s = "!#$%&'()*+,-./:;<=>?@^_`{|}~" punct = pynini.union(*s) - graph = pynutil.insert("name: \"") + punct + pynutil.insert("\"") + graph = pynutil.insert('name: "') + punct + pynutil.insert('"') self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py index 310a4427d..b8b1a7a4e 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py @@ -15,9 +15,17 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, delete_and -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path, integer_to_text +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, + delete_and, +) +from 
nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import ( + CardinalFst, +) +from nemo_text_processing.inverse_text_normalization.he.utils import ( + get_abs_path, + integer_to_text, +) from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, delete_extra_space, @@ -83,21 +91,31 @@ def __init__(self): cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7) labels_hour = [integer_to_text(x, only_fem=True)[0] for x in range(1, 13)] - labels_minute_single = [integer_to_text(x, only_fem=True)[0] for x in range(2, 10)] - labels_minute_double = [integer_to_text(x, only_fem=True)[0] for x in range(10, 60)] + labels_minute_single = [ + integer_to_text(x, only_fem=True)[0] for x in range(2, 10) + ] + labels_minute_double = [ + integer_to_text(x, only_fem=True)[0] for x in range(10, 60) + ] midnight = pynini.string_map([("חצות", "0")]) graph_hour = pynini.union(*labels_hour) @ cardinal graph_hour |= midnight add_leading_zero_to_double_digit = pynutil.insert("0") + NEMO_DIGIT - graph_minute_single = pynini.union(*labels_minute_single) @ cardinal @ add_leading_zero_to_double_digit + graph_minute_single = ( + pynini.union(*labels_minute_single) + @ cardinal + @ add_leading_zero_to_double_digit + ) graph_minute_double = pynini.union(*labels_minute_double) @ cardinal - final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"") + final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') - graph_minute = pynini.union(pynutil.insert("00"), graph_minute_single, graph_minute_double) + graph_minute = pynini.union( + pynutil.insert("00"), graph_minute_single, graph_minute_double + ) - final_suffix = pynutil.insert("suffix: \"") + suffix_graph + pynutil.insert("\"") + final_suffix = pynutil.insert('suffix: "') + suffix_graph + pynutil.insert('"') final_suffix = delete_space + insert_space + final_suffix graph_h_and_m = ( @@ -105,35 +123,37 @@ def __init__(self): + delete_space + 
delete_and + insert_space - + pynutil.insert("minutes: \"") - + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose) - + pynutil.insert("\"") + + pynutil.insert('minutes: "') + + pynini.union( + graph_minute_single, graph_minute_double, graph_minute_verbose + ) + + pynutil.insert('"') + (pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1)) ) graph_special_m_to_h_suffix_time = ( - pynutil.insert("minutes: \"") + pynutil.insert('minutes: "') + graph_minute_to_verbose - + pynutil.insert("\"") + + pynutil.insert('"') + delete_space + pynutil.delete("ל") + insert_space - + pynutil.insert("hours: \"") + + pynutil.insert('hours: "') + to_hour_graph - + pynutil.insert("\"") + + pynutil.insert('"') ) graph_m_to_h_suffix_time = ( - pynutil.insert("minutes: \"") + pynutil.insert('minutes: "') + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph - + pynutil.insert("\"") + + pynutil.insert('"') + pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1) + delete_space + pynutil.delete("ל") + insert_space - + pynutil.insert("hours: \"") + + pynutil.insert('hours: "') + to_hour_graph - + pynutil.insert("\"") + + pynutil.insert('"') ) graph_h = ( @@ -141,72 +161,83 @@ def __init__(self): + delete_zero_or_one_space + final_graph_hour + delete_extra_space - + pynutil.insert("minutes: \"") + + pynutil.insert('minutes: "') + (pynutil.insert("00") | graph_minute) - + pynutil.insert("\"") + + pynutil.insert('"') + final_suffix ) midnight_graph = ( optional_time_prefix_graph + delete_zero_or_one_space - + pynutil.insert("hours: \"") + + pynutil.insert('hours: "') + midnight - + pynutil.insert("\"") + + pynutil.insert('"') + insert_space - + pynutil.insert("minutes: \"") + + pynutil.insert('minutes: "') + (pynutil.insert("00") | graph_minute) - + pynutil.insert("\"") + + pynutil.insert('"') ) graph_midnight_and_m = ( - pynutil.insert("hours: \"") + pynutil.insert('hours: "') + midnight - + pynutil.insert("\"") + + pynutil.insert('"') 
+ delete_space + delete_and + insert_space - + pynutil.insert("minutes: \"") - + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose) - + pynutil.insert("\"") + + pynutil.insert('minutes: "') + + pynini.union( + graph_minute_single, graph_minute_double, graph_minute_verbose + ) + + pynutil.insert('"') + (pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1)) ) to_midnight_verbose_graph = ( - pynutil.insert("minutes: \"") + pynutil.insert('minutes: "') + graph_minute_to_verbose - + pynutil.insert("\"") + + pynutil.insert('"') + delete_space + pynutil.delete("ל") + insert_space - + pynutil.insert("hours: \"") + + pynutil.insert('hours: "') + to_hour_graph - + pynutil.insert("\"") + + pynutil.insert('"') ) graph_m_to_midnight = ( - pynutil.insert("minutes: \"") + pynutil.insert('minutes: "') + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph - + pynutil.insert("\"") + + pynutil.insert('"') + pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1) + delete_space + pynutil.delete("ל") + insert_space - + pynutil.insert("hours: \"") + + pynutil.insert('hours: "') + to_hour_graph - + pynutil.insert("\"") + + pynutil.insert('"') ) final_graph_midnight = ( optional_time_prefix_graph + delete_zero_or_one_space - + (midnight_graph | to_midnight_verbose_graph | graph_m_to_midnight | graph_midnight_and_m) + + ( + midnight_graph + | to_midnight_verbose_graph + | graph_m_to_midnight + | graph_midnight_and_m + ) ) final_graph = ( optional_time_prefix_graph + delete_zero_or_one_space - + (graph_h_and_m | graph_special_m_to_h_suffix_time | graph_m_to_h_suffix_time) + + ( + graph_h_and_m + | graph_special_m_to_h_suffix_time + | graph_m_to_h_suffix_time + ) + final_suffix ) final_graph |= graph_h diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py index 397525a44..3cb899150 100644 --- 
a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py @@ -18,17 +18,41 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.date import DateFst -from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.measure import MeasureFst -from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import OrdinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import PunctuationFst -from nemo_text_processing.inverse_text_normalization.he.taggers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import WhiteListFst -from nemo_text_processing.inverse_text_normalization.he.taggers.word import WordFst -from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, generator_main +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import ( + CardinalFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.date import ( + DateFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import ( + DecimalFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.measure import ( + MeasureFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import ( + OrdinalFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import ( + PunctuationFst, +) +from 
nemo_text_processing.inverse_text_normalization.he.taggers.time import ( + TimeFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import ( + WhiteListFst, +) +from nemo_text_processing.inverse_text_normalization.he.taggers.word import ( + WordFst, +) +from nemo_text_processing.text_normalization.en.graph_utils import ( + delete_extra_space, + delete_space, + generator_main, +) class ClassifyFst(GraphFst): @@ -44,7 +68,11 @@ class ClassifyFst(GraphFst): """ def __init__( - self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None, input_case: str = None + self, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + input_case: str = None, ): super().__init__(name="tokenize_and_classify", kind="classify") @@ -85,13 +113,21 @@ def __init__( # NOTE: we convert ordinals in Hebrew only if it is a part of a date! this is why it is missing. ) - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + punct = ( + pynutil.insert("tokens { ") + + pynutil.add_weight(punct_graph, weight=1.1) + + pynutil.insert(" }") + ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + pynini.closure(punct + pynutil.insert(" ")) + + token + + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = token_plus_punct + pynini.closure( + delete_extra_space + token_plus_punct + ) graph = delete_space + graph + delete_space self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py index b1cfa22bd..b1f43bbe5 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py +++ 
b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py @@ -45,9 +45,16 @@ def __init__(self, input_file: str = None): raise ValueError(f"Whitelist file {input_file} not found") optional_prefix_graph = pynini.closure( - pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + insert_space, 0, 1 + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + + insert_space, + 0, + 1, ) whitelist = string_map_cased(input_file) - graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"") + graph = ( + pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"') + ) final_graph = optional_prefix_graph + graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py index 142036ace..090e56072 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py @@ -15,8 +15,12 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, +) +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_SPACE, +) class WordFst(GraphFst): @@ -27,5 +31,9 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = ( + pynutil.insert('name: "') + + pynini.closure(NEMO_NOT_SPACE, 1) + + pynutil.insert('"') + ) self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py 
b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py index d4232b2f3..e90cc31f2 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -15,8 +15,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + NEMO_ALPHA_HE, + GraphFst, +) +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + delete_space, +) class CardinalFst(GraphFst): @@ -36,16 +42,19 @@ def __init__(self): at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # Thousands separator - group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + group_by_threes = ( + at_most_three_digits + + (pynutil.insert(",") + exactly_three_digits).closure() + ) # Keep the prefix if exists and add a dash optional_prefix = pynini.closure( pynutil.delete("morphosyntactic_features:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_ALPHA_HE, 1) + pynutil.insert("-") - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space, 0, 1, @@ -55,9 +64,9 @@ def __init__(self): optional_sign = pynini.closure( pynutil.delete("negative:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.accep("-") - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space, 0, 1, @@ -67,9 +76,9 @@ def __init__(self): graph = ( pynutil.delete("integer:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit - + pynutil.delete("\"") + + pynutil.delete('"') ) # Add thousands separator From b0a57e070e39d26651e42a4b1415e9bc906d1117 Mon Sep 17 
00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 20:52:24 +0000 Subject: [PATCH 4/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../he/taggers/cardinal.py | 56 ++++---------- .../he/taggers/date.py | 25 ++----- .../he/taggers/decimal.py | 74 ++++--------------- .../he/taggers/measure.py | 16 +--- .../he/taggers/ordinal.py | 8 +- .../he/taggers/punctuation.py | 4 +- .../he/taggers/time.py | 53 +++---------- .../he/taggers/tokenize_and_classify.py | 60 ++++----------- .../he/taggers/whitelist.py | 9 +-- .../he/taggers/word.py | 14 +--- .../he/verbalizers/cardinal.py | 15 +--- 11 files changed, 74 insertions(+), 260 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py index 3dd822f48..5a7122cb2 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -90,27 +90,19 @@ def __init__(self): (pynutil.insert("00") + thousand), many_thousands, pynutil.insert("000", weight=0.001) ) - self.graph_thousands = pynini.union( - graph_thousands + delete_space + graph_hundred, graph_zero - ) + self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero) self.graph_thousands @= pynini.union( - pynutil.delete(pynini.closure("0")) - + pynini.difference(NEMO_DIGIT, "0") - + pynini.closure(NEMO_DIGIT), + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0", ) # millions million = pynini.string_map([("מיליון", "001")]) - delete_millions = pynutil.delete("מיליונים") | pynutil.delete( - "מיליון", weight=0.001 - ) + delete_millions = pynutil.delete("מיליונים") | pynutil.delete("מיליון", weight=0.001) many_millions = large_number_prefix + delete_space + 
delete_millions - graph_millions = pynini.union( - many_millions, million, pynutil.insert("000", weight=0.001) - ) + graph_millions = pynini.union(many_millions, million, pynutil.insert("000", weight=0.001)) graph = pynini.union( graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, graph_zero @@ -121,9 +113,7 @@ def __init__(self): ) labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv")) - labels_exception = list( - set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"]) - ) + labels_exception = list(set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"])) labels_exception += ["ו" + label for label in labels_exception] graph_exception = pynini.union(*labels_exception).optimize() @@ -132,51 +122,31 @@ def __init__(self): self.graph_no_exception = graph ### Token insertion - minus_graph = ( - pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE - ) + minus_graph = pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE optional_minus_graph = pynini.closure(minus_graph, 0, 1) optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - + insert_space, + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, 0, 1, ) - graph_wo_small_digits = ( - pynini.project(graph, "input") - graph_exception.arcsort() - ) @ graph + graph_wo_small_digits = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph - cardinal_wo_viable_hours = load_labels( - get_abs_path("data/numbers/viable_hours.tsv") - ) + cardinal_wo_viable_hours = load_labels(get_abs_path("data/numbers/viable_hours.tsv")) cardinal_wo_viable_hours = list(set([x[0] for x in cardinal_wo_viable_hours])) viable_hours_exception = pynini.union(*cardinal_wo_viable_hours).optimize() - self.graph_wo_viable_hours = ( - pynini.project(graph, "input") - viable_hours_exception.arcsort() - ) @ graph + 
self.graph_wo_viable_hours = (pynini.project(graph, "input") - viable_hours_exception.arcsort()) @ graph small_number_with_minus = ( - insert_space - + minus_graph - + pynutil.insert('integer: "') - + self.graph_no_exception - + pynutil.insert('"') + insert_space + minus_graph + pynutil.insert('integer: "') + self.graph_no_exception + pynutil.insert('"') ) big_number_with_optional_minus = ( - optional_minus_graph - + pynutil.insert('integer: "') - + graph_wo_small_digits - + pynutil.insert('"') + optional_minus_graph + pynutil.insert('integer: "') + graph_wo_small_digits + pynutil.insert('"') ) - graph = optional_prefix_graph + ( - small_number_with_minus | big_number_with_optional_minus - ) + graph = optional_prefix_graph + (small_number_with_minus | big_number_with_optional_minus) final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py index 2a3db1907..6ada20210 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py @@ -57,16 +57,10 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): day_graph = pynutil.insert('day: "') + day_graph + pynutil.insert('"') month_names = pynini.string_file(get_abs_path("data/months.tsv")) - month_names_graph = ( - pynutil.insert('month: "') + month_names + pynutil.insert('"') - ) + month_names_graph = pynutil.insert('month: "') + month_names + pynutil.insert('"') - month_name2number = pynini.string_file( - get_abs_path("data/months_name2number.tsv") - ) - month_name2number_graph = ( - pynutil.insert('month: "') + month_name2number + pynutil.insert('"') - ) + month_name2number = pynini.string_file(get_abs_path("data/months_name2number.tsv")) + month_name2number_graph = pynutil.insert('month: "') + month_name2number + pynutil.insert('"') month_number2number = 
pynini.string_file(get_abs_path("data/months_ordinal2number.tsv")) month_number2number_graph = pynutil.insert("month: \"") + month_number2number + pynutil.insert("\"") @@ -74,12 +68,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): all_month_graph = month_name2number_graph | month_number2number_graph year_graph = _get_year_graph(two_digits_graph, cardinal.graph_thousands) - graph_year = ( - delete_extra_space - + pynutil.insert('year: "') - + year_graph - + pynutil.insert('"') - ) + graph_year = delete_extra_space + pynutil.insert('year: "') + year_graph + pynutil.insert('"') prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) delete_prefix = pynutil.delete(prefix_graph) @@ -111,11 +100,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): + graph_year ) - graph_my = ( - pynini.closure(graph_prefix + insert_space, 0, 1) - + month_names_graph - + graph_year - ) + graph_my = pynini.closure(graph_prefix + insert_space, 0, 1) + month_names_graph + graph_year graph_y_only = year_prefix_graph + graph_year final_graph = graph_dm | graph_dmy | graph_my | graph_y_only diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py index ef9d5b625..193f6cad2 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py @@ -26,9 +26,7 @@ ) -def get_quantity( - decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike" -) -> "pynini.FstLike": +def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike": """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral in Hebrew, e.g. 
one million -> integer_part: "1" quantity: "million" @@ -39,9 +37,7 @@ def get_quantity( cardinal_up_to_hundred: cardinal FST """ numbers = cardinal_up_to_hundred @ ( - pynutil.delete(pynini.closure("0")) - + pynini.difference(NEMO_DIGIT, "0") - + pynini.closure(NEMO_DIGIT) + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) ) suffix_labels = ["אלף", "מיליון", "מיליארד"] @@ -57,13 +53,7 @@ def get_quantity( + suffix + pynutil.insert('"') ) - res |= ( - decimal - + delete_extra_space - + pynutil.insert('quantity: "') - + (suffix | "אלף") - + pynutil.insert('"') - ) + res |= decimal + delete_extra_space + pynutil.insert('quantity: "') + (suffix | "אלף") + pynutil.insert('"') return res @@ -85,10 +75,7 @@ def __init__(self, cardinal: GraphFst): prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - + insert_space, + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, 0, 1, ) @@ -97,28 +84,16 @@ def __init__(self, cardinal: GraphFst): cardinal_graph = cardinal.graph_no_exception # all fractions - fractions = pynini.string_file( - get_abs_path("data/numbers/decimal_fractions.tsv") - ) + fractions = pynini.string_file(get_abs_path("data/numbers/decimal_fractions.tsv")) fractions_graph = delete_zero_or_one_space + delete_and + fractions - fractions_graph = ( - pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"') - ) + fractions_graph = pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"') # identify decimals that can be understood time, and don't convert them to avoid ambiguity viable_minutes_verbose = ["חצי", "רבע"] viable_minutes_exception = pynini.union(*viable_minutes_verbose).optimize() - fractions_wo_minutes = ( - pynini.project(fractions, "input") - 
viable_minutes_exception.arcsort() - ) @ fractions - fractions_wo_minutes = ( - delete_zero_or_one_space + delete_and + fractions_wo_minutes - ) - fractions_wo_minutes = ( - pynutil.insert('fractional_part: "') - + fractions_wo_minutes - + pynutil.insert('"') - ) + fractions_wo_minutes = (pynini.project(fractions, "input") - viable_minutes_exception.arcsort()) @ fractions + fractions_wo_minutes = delete_zero_or_one_space + delete_and + fractions_wo_minutes + fractions_wo_minutes = pynutil.insert('fractional_part: "') + fractions_wo_minutes + pynutil.insert('"') graph_decimal = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_decimal |= cardinal.graph_two_digit @@ -135,17 +110,11 @@ def __init__(self, cardinal: GraphFst): 1, ) - graph_integer = ( - pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') - ) - graph_fractional = ( - pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"') - ) + graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + graph_fractional = pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"') # integer could be an hour, but minutes cannot: convert to decimal - viable_hour_unviable_minutes = ( - graph_integer + delete_extra_space + fractions_wo_minutes - ) + viable_hour_unviable_minutes = graph_integer + delete_extra_space + fractions_wo_minutes # integer cannot be an hour, but minutes can: convert to decimal unviable_hour_viable_minutes = ( @@ -157,33 +126,22 @@ def __init__(self, cardinal: GraphFst): ) # minus sign followed by ambiguous decimal: convert to decimal, there is no negative time - negative_viable_time = ( - graph_negative + graph_integer + delete_extra_space + fractions_graph - ) + negative_viable_time = graph_negative + graph_integer + delete_extra_space + fractions_graph # all decimals with fractions, not excluding anything (used in other FSTs) all_decimals_wo_point = graph_integer + delete_extra_space + fractions_graph 
# only cases with fractional part that cannot be interpreted as time - graph_wo_point = ( - viable_hour_unviable_minutes - | unviable_hour_viable_minutes - | negative_viable_time - ) + graph_wo_point = viable_hour_unviable_minutes | unviable_hour_viable_minutes | negative_viable_time # all decimals with the word "point" graph_w_point = ( - pynini.closure(graph_integer + delete_extra_space, 0, 1) - + point - + delete_extra_space - + graph_fractional + pynini.closure(graph_integer + delete_extra_space, 0, 1) + point + delete_extra_space + graph_fractional ) final_graph_wo_sign = graph_w_point | graph_wo_point self.final_graph_wo_sign = graph_w_point | all_decimals_wo_point - final_graph = ( - optional_prefix_graph + optional_graph_negative + final_graph_wo_sign - ) + final_graph = optional_prefix_graph + optional_graph_negative + final_graph_wo_sign quantity_graph = get_quantity(self.final_graph_wo_sign, cardinal.graph_hundred) final_graph |= optional_prefix_graph + optional_graph_negative + quantity_graph diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py index 20109f425..7fe07cb74 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py @@ -15,18 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import ( - CardinalFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import ( - DecimalFst, -) -from nemo_text_processing.inverse_text_normalization.he.utils import ( - get_abs_path, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from 
nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_SPACE, delete_extra_space, diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py index 01d1ca7bc..da656c33c 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py @@ -15,12 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, -) -from nemo_text_processing.inverse_text_normalization.he.utils import ( - get_abs_path, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py index b7f344f13..1f4b17e47 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py @@ -15,9 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst class PunctuationFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py index b8b1a7a4e..35735f2fd 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py +++ 
b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py @@ -15,17 +15,9 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, - delete_and, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import ( - CardinalFst, -) -from nemo_text_processing.inverse_text_normalization.he.utils import ( - get_abs_path, - integer_to_text, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, delete_and +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path, integer_to_text from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, delete_extra_space, @@ -91,29 +83,19 @@ def __init__(self): cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7) labels_hour = [integer_to_text(x, only_fem=True)[0] for x in range(1, 13)] - labels_minute_single = [ - integer_to_text(x, only_fem=True)[0] for x in range(2, 10) - ] - labels_minute_double = [ - integer_to_text(x, only_fem=True)[0] for x in range(10, 60) - ] + labels_minute_single = [integer_to_text(x, only_fem=True)[0] for x in range(2, 10)] + labels_minute_double = [integer_to_text(x, only_fem=True)[0] for x in range(10, 60)] midnight = pynini.string_map([("חצות", "0")]) graph_hour = pynini.union(*labels_hour) @ cardinal graph_hour |= midnight add_leading_zero_to_double_digit = pynutil.insert("0") + NEMO_DIGIT - graph_minute_single = ( - pynini.union(*labels_minute_single) - @ cardinal - @ add_leading_zero_to_double_digit - ) + graph_minute_single = pynini.union(*labels_minute_single) @ cardinal @ add_leading_zero_to_double_digit graph_minute_double = pynini.union(*labels_minute_double) @ cardinal final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') - graph_minute = pynini.union( - 
pynutil.insert("00"), graph_minute_single, graph_minute_double - ) + graph_minute = pynini.union(pynutil.insert("00"), graph_minute_single, graph_minute_double) final_suffix = pynutil.insert('suffix: "') + suffix_graph + pynutil.insert('"') final_suffix = delete_space + insert_space + final_suffix @@ -124,9 +106,7 @@ def __init__(self): + delete_and + insert_space + pynutil.insert('minutes: "') - + pynini.union( - graph_minute_single, graph_minute_double, graph_minute_verbose - ) + + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose) + pynutil.insert('"') + (pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1)) ) @@ -187,9 +167,7 @@ def __init__(self): + delete_and + insert_space + pynutil.insert('minutes: "') - + pynini.union( - graph_minute_single, graph_minute_double, graph_minute_verbose - ) + + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose) + pynutil.insert('"') + (pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1)) ) @@ -222,22 +200,13 @@ def __init__(self): final_graph_midnight = ( optional_time_prefix_graph + delete_zero_or_one_space - + ( - midnight_graph - | to_midnight_verbose_graph - | graph_m_to_midnight - | graph_midnight_and_m - ) + + (midnight_graph | to_midnight_verbose_graph | graph_m_to_midnight | graph_midnight_and_m) ) final_graph = ( optional_time_prefix_graph + delete_zero_or_one_space - + ( - graph_h_and_m - | graph_special_m_to_h_suffix_time - | graph_m_to_h_suffix_time - ) + + (graph_h_and_m | graph_special_m_to_h_suffix_time | graph_m_to_h_suffix_time) + final_suffix ) final_graph |= graph_h diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py index 3cb899150..26ee9428d 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py +++ 
b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py @@ -18,41 +18,17 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import ( - CardinalFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.date import ( - DateFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import ( - DecimalFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.measure import ( - MeasureFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import ( - OrdinalFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import ( - PunctuationFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.time import ( - TimeFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import ( - WhiteListFst, -) -from nemo_text_processing.inverse_text_normalization.he.taggers.word import ( - WordFst, -) -from nemo_text_processing.text_normalization.en.graph_utils import ( - delete_extra_space, - delete_space, - generator_main, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.he.taggers.time import TimeFst +from 
nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.he.taggers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, generator_main class ClassifyFst(GraphFst): @@ -113,21 +89,13 @@ def __init__( # NOTE: we convert ordinals in Hebrew only if it is a part of a date! this is why it is missing. ) - punct = ( - pynutil.insert("tokens { ") - + pynutil.add_weight(punct_graph, weight=1.1) - + pynutil.insert(" }") - ) + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) - + token - + pynini.closure(pynutil.insert(" ") + punct) + pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure( - delete_extra_space + token_plus_punct - ) + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) graph = delete_space + graph + delete_space self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py index b1f43bbe5..0395851d0 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py @@ -45,16 +45,11 @@ def __init__(self, input_file: str = None): raise ValueError(f"Whitelist file {input_file} not found") optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - + insert_space, + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, 0, 1, ) whitelist = string_map_cased(input_file) - graph = 
( - pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"') - ) + graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"') final_graph = optional_prefix_graph + graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py index 090e56072..5ebf1ee57 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py @@ -15,12 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, -) -from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_NOT_SPACE, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE class WordFst(GraphFst): @@ -31,9 +27,5 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = ( - pynutil.insert('name: "') - + pynini.closure(NEMO_NOT_SPACE, 1) - + pynutil.insert('"') - ) + word = pynutil.insert('name: "') + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert('"') self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py index e90cc31f2..54a19ddc1 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -15,14 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - NEMO_ALPHA_HE, - GraphFst, -) -from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, - delete_space, -) +from 
nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space class CardinalFst(GraphFst): @@ -42,10 +36,7 @@ def __init__(self): at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # Thousands separator - group_by_threes = ( - at_most_three_digits - + (pynutil.insert(",") + exactly_three_digits).closure() - ) + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() # Keep the prefix if exists and add a dash optional_prefix = pynini.closure( From 8d47f2d7279756c43450c79c95e37a950c425420 Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Thu, 25 Sep 2025 14:06:53 -0700 Subject: [PATCH 5/9] rebasing Signed-off-by: tbartley94 --- .../he/taggers/cardinal.py | 40 +++++++++++++---- .../he/taggers/measure.py | 2 +- .../he/verbalizers/date.py | 5 ++- .../he/verbalizers/measure.py | 45 ++++++++++++------- 4 files changed, 65 insertions(+), 27 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py index 5a7122cb2..df43d3dd1 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -21,13 +21,16 @@ delete_and, delete_optional_and, ) -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.he.utils import ( + get_abs_path, +) + from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, - delete_space, insert_space, + delete_space, ) from nemo_text_processing.text_normalization.en.utils import load_labels @@ -49,10 +52,15 @@ def __init__(self): # teens graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) graph_ties = 
pynini.string_file(get_abs_path("data/numbers/ties.tsv")) - graph_ties += pynini.union(delete_space + delete_and + graph_digit, pynutil.insert("0", weight=0.001)) + graph_ties += pynini.union( + delete_space + delete_and + graph_digit, + pynutil.insert("0", weight=0.001), + ) graph_two_digit = pynini.union(graph_teen, graph_ties) - self.graph_two_digit = graph_two_digit | graph_digit + self.graph_two_digit = pynini.union( + graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001) + ) # hundreds hundred = pynini.string_map([("מאה", "1"), ("מאתיים", "2")]) @@ -69,7 +77,9 @@ def __init__(self): pynutil.insert("00", weight=0.001), ) graph_hundred = pynini.union( - graph_hundred, pynutil.insert("0") + graph_two_digit, pynutil.insert("00") + graph_digit + graph_hundred, + pynutil.insert("0") + graph_two_digit, + pynutil.insert("00") + graph_digit, ) self.graph_hundred = graph_hundred @ ( @@ -82,12 +92,16 @@ def __init__(self): delete_thousand = pynutil.delete("אלפים") | pynutil.delete("אלף", weight=0.001) large_number_prefix = pynini.union( - graph_hundred, pynutil.insert("0") + graph_two_digit, pynutil.insert("00") + thousand_digit + graph_hundred, + pynutil.insert("0") + graph_two_digit, + pynutil.insert("00") + thousand_digit, ) many_thousands = large_number_prefix + delete_space + delete_thousand graph_thousands = delete_optional_and + pynini.union( - (pynutil.insert("00") + thousand), many_thousands, pynutil.insert("000", weight=0.001) + (pynutil.insert("00") + thousand), + many_thousands, + pynutil.insert("000", weight=0.001), ) self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero) @@ -105,11 +119,19 @@ def __init__(self): graph_millions = pynini.union(many_millions, million, pynutil.insert("000", weight=0.001)) graph = pynini.union( - graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, graph_zero + graph_millions + + delete_space + + graph_thousands + + delete_space + + graph_hundred, + 
graph_zero, ) graph = graph @ pynini.union( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0" + pynutil.delete(pynini.closure("0")) + + pynini.difference(NEMO_DIGIT, "0") + + pynini.closure(NEMO_DIGIT), + "0", ) labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv")) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py index 7fe07cb74..3ebf3a420 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py @@ -91,7 +91,7 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): spaced_units = pynini.string_file(get_abs_path("data/spaced_measurements.tsv")) spaced_units = pynini.invert(spaced_units) spaced_units = ( - pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"'). # noqa: W605 + pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"') # noqa: W605 ) # in joint units the unit is concatenated to the number, in spaced unit separate the unit with a space diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py index 9b69d4e28..00920fdef 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py @@ -18,6 +18,7 @@ from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_NOT_QUOTE, + NEMO_SPACE, delete_space, delete_zero_or_one_space, insert_space, @@ -101,14 +102,14 @@ def __init__(self): # day month and year graph_dmy = ( - graph_dm + delete_space + pynutil.insert('.') + pynini.closure(delete_zero_or_one_space + year, 0, 1) + graph_dm + delete_space + 
pynutil.insert('.') + delete_zero_or_one_space + year ) # only month and year graph_my = ( pynini.closure(month_prefix + delete_zero_or_one_space, 0, 1) + month - + pynutil.insert(' ') + + pynutil.insert(NEMO_SPACE) + pynini.closure(delete_zero_or_one_space + year, 0, 1) ) diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py index 6c080d910..4040d4093 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -15,13 +15,15 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, +) from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_NOT_QUOTE, - NEMO_SIGMA, - NEMO_SPACE, delete_space, + NEMO_SPACE, + NEMO_SIGMA, ) @@ -46,10 +48,10 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst): optional_prefix = pynini.closure( pynutil.delete("morphosyntactic_features:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.insert('-') - + pynutil.delete("\"") + + pynutil.insert("-") + + pynutil.delete('"') + delete_space, 0, 1, @@ -57,40 +59,53 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst): # Removes the negative attribute and leaves the sign if occurs optional_sign = pynini.closure( - pynutil.delete("negative:") + pynutil.delete("code_switch:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.accep("-") - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space, 0, 1, ) graph_decimal = ( - pynutil.delete("decimal {") + delete_space + decimal.numbers + delete_space + pynutil.delete("}") + pynutil.delete("decimal {") + + delete_space + + decimal.numbers + 
+ delete_space + + pynutil.delete("}") ) graph_cardinal = ( - pynutil.delete("cardinal {") + delete_space + cardinal.numbers + delete_space + pynutil.delete("}") + pynutil.delete("cardinal {") + + delete_space + + cardinal.numbers + + delete_space + + pynutil.delete("}") ) unit = ( pynutil.delete("units:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space ) unit @= pynini.cdrewrite( - pynini.cross("\[SPACE\]", NEMO_SPACE), "", "", NEMO_SIGMA + pynini.cross("\[SPACE\]", NEMO_SPACE), "", "", NEMO_SIGMA # noqa: W605 ) # For space separated measures. numbers_units = delete_space + unit numbers_graph = (graph_cardinal | graph_decimal) + numbers_units - one_graph = delete_space + pynutil.insert("1") + unit + pynutil.delete("cardinal { integer: \"1\" }") + one_graph = ( + delete_space + + pynutil.insert("1") + + unit + + pynutil.delete('cardinal { integer: "1" }') + ) graph = optional_prefix + optional_sign + (numbers_graph | one_graph) delete_tokens = self.delete_tokens(graph) From 14eecaa03b73f95fda1d343e0b74239014d3488b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:07:40 +0000 Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../he/taggers/cardinal.py | 21 ++++----------- .../he/taggers/measure.py | 25 ++++------------- .../he/verbalizers/date.py | 4 +-- .../he/verbalizers/measure.py | 27 +++++-------------- 4 files changed, 17 insertions(+), 60 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py index df43d3dd1..38e355911 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py +++ 
b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -21,16 +21,13 @@ delete_and, delete_optional_and, ) -from nemo_text_processing.inverse_text_normalization.he.utils import ( - get_abs_path, -) - +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, - insert_space, delete_space, + insert_space, ) from nemo_text_processing.text_normalization.en.utils import load_labels @@ -58,9 +55,7 @@ def __init__(self): ) graph_two_digit = pynini.union(graph_teen, graph_ties) - self.graph_two_digit = pynini.union( - graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001) - ) + self.graph_two_digit = pynini.union(graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001)) # hundreds hundred = pynini.string_map([("מאה", "1"), ("מאתיים", "2")]) @@ -119,18 +114,12 @@ def __init__(self): graph_millions = pynini.union(many_millions, million, pynutil.insert("000", weight=0.001)) graph = pynini.union( - graph_millions - + delete_space - + graph_thousands - + delete_space - + graph_hundred, + graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, graph_zero, ) graph = graph @ pynini.union( - pynutil.delete(pynini.closure("0")) - + pynini.difference(NEMO_DIGIT, "0") - + pynini.closure(NEMO_DIGIT), + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0", ) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py index 3ebf3a420..01496127d 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py @@ -55,10 +55,7 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) 
optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - + insert_space, + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, 0, 1, ) @@ -68,10 +65,7 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): # Let singular apply to values > 1 as they could be part of an adjective phrase (e.g. 14 foot tall building) subgraph_decimal = ( - pynutil.insert("decimal { ") - + decimal.final_graph_wo_sign - + pynutil.insert(" }") - + delete_extra_space + pynutil.insert("decimal { ") + decimal.final_graph_wo_sign + pynutil.insert(" }") + delete_extra_space ) subgraph_cardinal = ( @@ -90,9 +84,7 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): spaced_units = pynini.string_file(get_abs_path("data/spaced_measurements.tsv")) spaced_units = pynini.invert(spaced_units) - spaced_units = ( - pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"') # noqa: W605 - ) + spaced_units = pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"') # noqa: W605 # in joint units the unit is concatenated to the number, in spaced unit separate the unit with a space units_graph = joined_units | spaced_units @@ -110,15 +102,8 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): ) number_graph = subgraph_decimal | subgraph_cardinal - number_unit_graph = (number_graph + units_graph) | ( - units_graph + delete_space + one_graph - ) + number_unit_graph = (number_graph + units_graph) | (units_graph + delete_space + one_graph) - final_graph = ( - optional_prefix_graph - + optional_graph_negative - + number_unit_graph - + delete_zero_or_one_space - ) + final_graph = optional_prefix_graph + optional_graph_negative + number_unit_graph + delete_zero_or_one_space final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git 
a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py index 00920fdef..2f686f39b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py @@ -101,9 +101,7 @@ def __init__(self): ) # day month and year - graph_dmy = ( - graph_dm + delete_space + pynutil.insert('.') + delete_zero_or_one_space + year - ) + graph_dmy = graph_dm + delete_space + pynutil.insert('.') + delete_zero_or_one_space + year # only month and year graph_my = ( diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py index 4040d4093..1c546719a 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -15,15 +15,13 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, -) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_CHAR, NEMO_NOT_QUOTE, - delete_space, - NEMO_SPACE, NEMO_SIGMA, + NEMO_SPACE, + delete_space, ) @@ -70,19 +68,11 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst): ) graph_decimal = ( - pynutil.delete("decimal {") - + delete_space - + decimal.numbers - + delete_space - + pynutil.delete("}") + pynutil.delete("decimal {") + delete_space + decimal.numbers + delete_space + pynutil.delete("}") ) graph_cardinal = ( - pynutil.delete("cardinal {") - + delete_space - + cardinal.numbers - + delete_space - + pynutil.delete("}") + pynutil.delete("cardinal {") + delete_space + cardinal.numbers + delete_space + pynutil.delete("}") ) unit = ( @@ -100,12 +90,7 @@ def __init__(self, decimal: 
GraphFst, cardinal: GraphFst): numbers_units = delete_space + unit numbers_graph = (graph_cardinal | graph_decimal) + numbers_units - one_graph = ( - delete_space - + pynutil.insert("1") - + unit - + pynutil.delete('cardinal { integer: "1" }') - ) + one_graph = delete_space + pynutil.insert("1") + unit + pynutil.delete('cardinal { integer: "1" }') graph = optional_prefix + optional_sign + (numbers_graph | one_graph) delete_tokens = self.delete_tokens(graph) From 7b10d3d7f0fffb147ab903013199c67a8b04c7eb Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Mon, 3 Nov 2025 15:26:20 -0800 Subject: [PATCH 7/9] responding to formatting pr Signed-off-by: tbartley94 --- .../inverse_text_normalization/he/__init__.py | 2 +- .../he/data/numbers/__init__.py | 2 +- .../he/data/ordinals/__init__.py | 2 +- .../inverse_text_normalization/he/data/time/__init__.py | 2 +- .../inverse_text_normalization/he/graph_utils.py | 7 +++---- .../inverse_text_normalization/he/taggers/__init__.py | 2 +- .../inverse_text_normalization/he/taggers/cardinal.py | 4 ++-- .../inverse_text_normalization/he/taggers/date.py | 8 +++----- .../inverse_text_normalization/he/taggers/decimal.py | 2 +- .../inverse_text_normalization/he/taggers/measure.py | 2 +- .../inverse_text_normalization/he/taggers/ordinal.py | 2 +- .../inverse_text_normalization/he/taggers/punctuation.py | 2 +- .../inverse_text_normalization/he/taggers/time.py | 2 +- .../he/taggers/tokenize_and_classify.py | 3 +-- .../inverse_text_normalization/he/taggers/whitelist.py | 2 +- .../inverse_text_normalization/he/taggers/word.py | 2 +- .../inverse_text_normalization/he/utils.py | 2 +- .../inverse_text_normalization/he/verbalizers/__init__.py | 2 +- .../inverse_text_normalization/he/verbalizers/cardinal.py | 2 +- .../inverse_text_normalization/he/verbalizers/date.py | 2 +- .../inverse_text_normalization/he/verbalizers/decimal.py | 2 +- .../inverse_text_normalization/he/verbalizers/measure.py | 2 +- 
.../inverse_text_normalization/he/verbalizers/ordinal.py | 2 +- .../inverse_text_normalization/he/verbalizers/time.py | 2 +- .../he/verbalizers/verbalize.py | 2 +- .../he/verbalizers/verbalize_final.py | 2 +- .../he/verbalizers/whitelist.py | 2 +- .../inverse_text_normalization/he/verbalizers/word.py | 2 +- .../test_cases_decimal.txt | 2 ++ tests/nemo_text_processing/he/test_full_sentences.py | 2 +- 30 files changed, 36 insertions(+), 38 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/he/__init__.py b/nemo_text_processing/inverse_text_normalization/he/__init__.py index bc443be41..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/he/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py index bc443be41..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py index bc443be41..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py index bc443be41..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py index adabc1445..607ecd1e9 100644 --- a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -24,9 +24,8 @@ NEMO_ALPHA_HE = pynini.union(*"אבגדהוזחטיכלמםנןסעפףצץקרשת").optimize() - -delete_optional_and = pynini.closure(pynutil.delete("ו")) -delete_and = pynini.cross("ו", "") +delete_and = pynutil.delete("ו") +delete_optional_and = delete_and.ques #################### MIN_NEG_WEIGHT = -0.0001 diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py index bc443be41..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py index 38e355911..2472a9112 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -50,7 +50,7 @@ def __init__(self): graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) graph_ties += pynini.union( - delete_space + delete_and + graph_digit, + delete_space + delete_optional_and + graph_digit, pynutil.insert("0", weight=0.001), ) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py index 6ada20210..dbd71dfe3 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,11 +36,9 @@ def _get_year_graph(graph_two_digits, graph_thousands): class DateFst(GraphFst): """ Finite state transducer for classifying date in Hebrew, - e.g. אחד במאי אלף תשע מאות שמונים ושלוש -> date { day: "1" month_prefix: "ב" month: "5" year: "1983" } - e.g. הראשון ביוני אלפיים ושתיים עשרה -> date { day_prefix: "ה" day: "1" month_prefix: "ב" month: "6" year: "2012" } - e.g. העשירי ביוני -> date { day_prefix: "ה" day: "10" month_prefix: "ב" month: "6" } + e.g. אחד במאי אלף תשע מאות שמונים ושלוש -> date { day: "1" morphosyntactic_features: "ב" month: "5" year: "1983" } e.g. מרץ אלף תשע מאות שמונים ותשע -> date { month: "מרץ" year: "1989" } - e.g. בינואר עשרים עשרים -> date { month_prefix: "ב" month: "ינואר" year: "2020" } + e.g. 
בינואר עשרים עשרים -> date { morphosyntactic_features: "ב" month: "ינואר" year: "2020" } Args: cardinal: CardinalFst diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py index 193f6cad2..2a3a319b2 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py index 01496127d..18d4d030c 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py index da656c33c..c7306ea43 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py index 1f4b17e47..b963e7b74 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py index 35735f2fd..43e07b932 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py index 26ee9428d..807dcf734 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -86,7 +86,6 @@ def __init__( | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) - # NOTE: we convert ordinals in Hebrew only if it is a part of a date! this is why it is missing. ) punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py index 0395851d0..58de7668e 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py index 5ebf1ee57..6b5394ac3 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/utils.py b/nemo_text_processing/inverse_text_normalization/he/utils.py index 30e9e3e2f..42a89dd78 100644 --- a/nemo_text_processing/inverse_text_normalization/he/utils.py +++ b/nemo_text_processing/inverse_text_normalization/he/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py index bc443be41..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py index 54a19ddc1..d26e1f703 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py index 2f686f39b..fc6211d16 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py index 8fcd388b3..86f20f882 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py index 1c546719a..c2ae11e54 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py index 2d7bd0832..e71b76ecb 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py index 465c240dc..110952dac 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py index 5a7ca62ad..0223259db 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py index db68d4318..611181df4 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py index b41bf2951..9446614a9 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py index a2ea163d2..0083ecbd1 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt index 968f2ef81..b864e264a 100644 --- a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt @@ -62,3 +62,5 @@ שתיים ושלושת רבעי~2.75 שתיים עשרה אלף ושתיים עשרה נקודה שתיים עשרה~12,012.12 שתים עשרה אלף ושתים עשרה נקודה שתים עשרה~12,012.12 +שתיים ועשירית~2.1 +אחת ועשירית~1.1 diff --git a/tests/nemo_text_processing/he/test_full_sentences.py b/tests/nemo_text_processing/he/test_full_sentences.py index 8eba0c7db..0bc9251a7 100644 --- a/tests/nemo_text_processing/he/test_full_sentences.py +++ b/tests/nemo_text_processing/he/test_full_sentences.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From ff98fbf56bae0d89c71832aa42cdb0d8d23fc073 Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Tue, 4 Nov 2025 20:19:52 -0800 Subject: [PATCH 8/9] isort and moving string map to string file Signed-off-by: tbartley94 --- .../he/data/decimals/minutes_exception.tsv | 2 + .../he/data/numbers/hundreds_exception.tsv | 2 + .../he/data/numbers/millions_exception.tsv | 1 + .../he/data/numbers/thousands_exception.tsv | 2 + .../he/data/time/day_suffix.tsv | 2 + .../he/data/time/evening_suffix.tsv | 2 + .../he/data/time/hour_to_evening.tsv | 7 + .../he/data/time/hour_to_night.tsv | 9 ++ .../he/data/time/hour_to_noon.tsv | 7 + .../he/data/time/midnight_to_hour.tsv | 1 + .../he/data/time/minute_to_verbose.tsv | 6 + .../he/data/time/minute_verbose.tsv | 8 ++ .../he/data/time/night_suffix.tsv | 1 + .../he/data/time/noon_suffix.tsv | 3 + .../he/graph_utils.py | 20 +-- .../he/taggers/cardinal.py | 111 +++++++++------ .../he/taggers/date.py | 52 +++++-- .../he/taggers/decimal.py | 108 ++++++++++----- .../he/taggers/measure.py | 52 ++++--- .../he/taggers/ordinal.py | 6 +- .../he/taggers/punctuation.py | 3 +- .../he/taggers/time.py | 129 ++++++++++-------- .../he/taggers/tokenize_and_classify.py | 47 +++++-- .../he/taggers/whitelist.py | 18 ++- .../he/taggers/word.py | 12 +- .../inverse_text_normalization/he/utils.py | 15 +- .../he/verbalizers/cardinal.py | 11 +- .../he/verbalizers/date.py | 47 ++++--- .../he/verbalizers/decimal.py | 39 ++++-- .../he/verbalizers/measure.py | 41 ++++-- .../he/verbalizers/ordinal.py | 10 +- .../he/verbalizers/time.py | 112 +++++++-------- .../he/verbalizers/verbalize.py | 32 +++-- .../he/verbalizers/verbalize_final.py | 19 ++- .../he/verbalizers/whitelist.py | 18 ++- .../he/verbalizers/word.py | 16 ++- .../test_cases_measure.txt | 1 + 37 files changed, 627 insertions(+), 345 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv create mode 100644 
nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv diff --git a/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv new file mode 100644 index 000000000..5626b7100 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv @@ -0,0 +1,2 @@ +חצי +רבע diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv new file mode 100644 index 000000000..88e54ab57 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv 
@@ -0,0 +1,2 @@ +מאה 1 +מאתיים 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv new file mode 100644 index 000000000..1443e5def --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv @@ -0,0 +1 @@ +מיליון 1 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv new file mode 100644 index 000000000..dd0c71c0d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv @@ -0,0 +1,2 @@ +אלף 1 +אלפיים 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv new file mode 100644 index 000000000..a4f9d2d46 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv @@ -0,0 +1,2 @@ +בבוקר +לפנות בוקר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv new file mode 100644 index 000000000..583470a05 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv @@ -0,0 +1,2 @@ +בערב +לפנות ערב \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv new file mode 100644 index 000000000..4fd47d1e2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv @@ -0,0 +1,7 @@ +5 17 +6 18 +7 19 +8 20 +9 21 +10 
22 +11 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv new file mode 100644 index 000000000..656d161b2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv @@ -0,0 +1,9 @@ +8 20 +9 21 +10 22 +11 23 +12 0 +1 1 +2 2 +3 3 +4 4 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv new file mode 100644 index 000000000..8d0de9024 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv @@ -0,0 +1,7 @@ +12 12 +1 13 +2 14 +3 15 +4 16 +5 17 +6 18 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv new file mode 100644 index 000000000..5b86a39eb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv @@ -0,0 +1 @@ +חצות 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv new file mode 100644 index 000000000..8f62ae4de --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv @@ -0,0 +1,6 @@ +רבע 45 +עשרה 50 +חמישה 55 +עשרים 40 +עשרים וחמישה 35 +דקה 59 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv new file mode 100644 index 000000000..efa2207c3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv 
@@ -0,0 +1,8 @@ +שלושת רבעי 45 +חצי 30 +רבע 15 +עשרים 20 +עשרה 10 +חמישה 05 +דקה 01 +שתי 02 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv new file mode 100644 index 000000000..464aa81c0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv @@ -0,0 +1 @@ +בלילה \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv new file mode 100644 index 000000000..963d81053 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv @@ -0,0 +1,3 @@ +בצהריים +אחרי הצהריים +אחר הצהריים \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py index 607ecd1e9..02642ea3d 100644 --- a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py @@ -19,10 +19,10 @@ from pynini import Far from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, delete_space +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_SIGMA, delete_space) from nemo_text_processing.text_normalization.en.utils import load_labels - NEMO_ALPHA_HE = pynini.union(*"אבגדהוזחטיכלמםנןסעפףצץקרשת").optimize() delete_and = pynutil.delete("ו") delete_optional_and = delete_and.ques @@ -68,9 +68,13 @@ def __init__(self, name: str, kind: str, deterministic: bool = True): self._fst = None self.deterministic = deterministic - self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + self.far_path = Path( + os.path.dirname(__file__) + "/grammars/" + kind + 
"/" + name + ".far" + ) if self.far_exist(): - self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + self._fst = Far( + self.far_path, mode="r", arc_type="standard", far_type="default" + ).get_fst() def far_exist(self) -> bool: """ @@ -79,14 +83,14 @@ def far_exist(self) -> bool: return self.far_path.exists() @property - def fst(self) -> 'pynini.FstLike': + def fst(self) -> "pynini.FstLike": return self._fst @fst.setter def fst(self, fst): self._fst = fst - def add_tokens(self, fst) -> 'pynini.FstLike': + def add_tokens(self, fst) -> "pynini.FstLike": """ Wraps class name around to given fst @@ -98,7 +102,7 @@ def add_tokens(self, fst) -> 'pynini.FstLike': """ return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") - def delete_tokens(self, fst) -> 'pynini.FstLike': + def delete_tokens(self, fst) -> "pynini.FstLike": """ Deletes class name wrap around output of given fst @@ -117,4 +121,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py index 2472a9112..f5fc265ab 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -16,19 +16,11 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - NEMO_ALPHA_HE, - GraphFst, - delete_and, - delete_optional_and, -) -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path + NEMO_ALPHA_HE, GraphFst, delete_and, delete_optional_and) +from nemo_text_processing.inverse_text_normalization.he.utils import \ + get_abs_path from 
nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, - NEMO_SIGMA, - NEMO_SPACE, - delete_space, - insert_space, -) + NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, delete_space, insert_space) from nemo_text_processing.text_normalization.en.utils import load_labels @@ -53,15 +45,19 @@ def __init__(self): delete_space + delete_optional_and + graph_digit, pynutil.insert("0", weight=0.001), ) - graph_two_digit = pynini.union(graph_teen, graph_ties) - self.graph_two_digit = pynini.union(graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001)) + + self.graph_two_digit = pynini.union( + graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001) + ) # hundreds - hundred = pynini.string_map([("מאה", "1"), ("מאתיים", "2")]) + hundred_exception = pynini.string_file( + get_abs_path("data/numbers/hundreds_exception.tsv") + ) delete_hundred = pynutil.delete("מאות") graph_hundred = delete_optional_and + pynini.union( - hundred, + hundred_exception, graph_digit + delete_space + delete_hundred, pynutil.insert("0", weight=0.001), ) @@ -76,13 +72,14 @@ def __init__(self): pynutil.insert("0") + graph_two_digit, pynutil.insert("00") + graph_digit, ) - self.graph_hundred = graph_hundred @ ( pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT) ) # thousands - thousand = pynini.string_map([("אלף", "1"), ("אלפיים", "2")]) + thousand_exception = pynini.string_file( + get_abs_path("data/numbers/thousands_exception.tsv") + ) thousand_digit = pynini.string_file(get_abs_path("data/numbers/thousands.tsv")) delete_thousand = pynutil.delete("אלפים") | pynutil.delete("אלף", weight=0.001) @@ -92,72 +89,106 @@ def __init__(self): pynutil.insert("00") + thousand_digit, ) many_thousands = large_number_prefix + delete_space + delete_thousand - graph_thousands = delete_optional_and + pynini.union( - (pynutil.insert("00") + thousand), + (pynutil.insert("00") + thousand_exception), many_thousands, pynutil.insert("000", weight=0.001), ) - 
self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero) + self.graph_thousands = pynini.union( + graph_thousands + delete_space + graph_hundred, graph_zero + ) self.graph_thousands @= pynini.union( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + pynutil.delete(pynini.closure("0")) + + pynini.difference(NEMO_DIGIT, "0") + + pynini.closure(NEMO_DIGIT), "0", ) # millions - million = pynini.string_map([("מיליון", "001")]) - - delete_millions = pynutil.delete("מיליונים") | pynutil.delete("מיליון", weight=0.001) + million_exceptions = pynini.string_file( + get_abs_path("data/numbers/millions_exception.tsv") + ) + million_exceptions = pynutil.insert("00") + million_exceptions + delete_millions = pynutil.delete("מיליונים") | pynutil.delete( + "מיליון", weight=0.001 + ) many_millions = large_number_prefix + delete_space + delete_millions - - graph_millions = pynini.union(many_millions, million, pynutil.insert("000", weight=0.001)) + graph_millions = pynini.union( + many_millions, million_exceptions, pynutil.insert("000", weight=0.001) + ) graph = pynini.union( - graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, + graph_millions + + delete_space + + graph_thousands + + delete_space + + graph_hundred, graph_zero, ) - graph = graph @ pynini.union( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + pynutil.delete(pynini.closure("0")) + + pynini.difference(NEMO_DIGIT, "0") + + pynini.closure(NEMO_DIGIT), "0", ) labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv")) - labels_exception = list(set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"])) + labels_exception = list( + set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"]) + ) labels_exception += ["ו" + label for label in labels_exception] graph_exception = pynini.union(*labels_exception).optimize() - graph = 
((NEMO_ALPHA_HE + NEMO_SIGMA) @ graph).optimize() self.graph_no_exception = graph ### Token insertion - minus_graph = pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE + minus_graph = ( + pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE + ) optional_minus_graph = pynini.closure(minus_graph, 0, 1) optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + + insert_space, 0, 1, ) - graph_wo_small_digits = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph + graph_wo_small_digits = ( + pynini.project(graph, "input") - graph_exception.arcsort() + ) @ graph - cardinal_wo_viable_hours = load_labels(get_abs_path("data/numbers/viable_hours.tsv")) + cardinal_wo_viable_hours = load_labels( + get_abs_path("data/numbers/viable_hours.tsv") + ) cardinal_wo_viable_hours = list(set([x[0] for x in cardinal_wo_viable_hours])) viable_hours_exception = pynini.union(*cardinal_wo_viable_hours).optimize() - self.graph_wo_viable_hours = (pynini.project(graph, "input") - viable_hours_exception.arcsort()) @ graph + self.graph_wo_viable_hours = ( + pynini.project(graph, "input") - viable_hours_exception.arcsort() + ) @ graph small_number_with_minus = ( - insert_space + minus_graph + pynutil.insert('integer: "') + self.graph_no_exception + pynutil.insert('"') + insert_space + + minus_graph + + pynutil.insert('integer: "') + + self.graph_no_exception + + pynutil.insert('"') ) big_number_with_optional_minus = ( - optional_minus_graph + pynutil.insert('integer: "') + graph_wo_small_digits + pynutil.insert('"') + optional_minus_graph + + pynutil.insert('integer: "') + + graph_wo_small_digits + + pynutil.insert('"') ) - graph = optional_prefix_graph + (small_number_with_minus | big_number_with_optional_minus) + graph = optional_prefix_graph + ( + 
small_number_with_minus | big_number_with_optional_minus + ) final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py index dbd71dfe3..5c0b16360 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py @@ -15,9 +15,12 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, insert_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import \ + get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + delete_extra_space, delete_space, insert_space) def _get_year_graph(graph_two_digits, graph_thousands): @@ -55,28 +58,47 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): day_graph = pynutil.insert('day: "') + day_graph + pynutil.insert('"') month_names = pynini.string_file(get_abs_path("data/months.tsv")) - month_names_graph = pynutil.insert('month: "') + month_names + pynutil.insert('"') + month_names_graph = ( + pynutil.insert('month: "') + month_names + pynutil.insert('"') + ) - month_name2number = pynini.string_file(get_abs_path("data/months_name2number.tsv")) - month_name2number_graph = pynutil.insert('month: "') + month_name2number + pynutil.insert('"') + month_name2number = pynini.string_file( + get_abs_path("data/months_name2number.tsv") + ) + month_name2number_graph = ( + pynutil.insert('month: "') + month_name2number + pynutil.insert('"') + ) - month_number2number = 
pynini.string_file(get_abs_path("data/months_ordinal2number.tsv")) - month_number2number_graph = pynutil.insert("month: \"") + month_number2number + pynutil.insert("\"") + month_number2number = pynini.string_file( + get_abs_path("data/months_ordinal2number.tsv") + ) + month_number2number_graph = ( + pynutil.insert('month: "') + month_number2number + pynutil.insert('"') + ) all_month_graph = month_name2number_graph | month_number2number_graph year_graph = _get_year_graph(two_digits_graph, cardinal.graph_thousands) - graph_year = delete_extra_space + pynutil.insert('year: "') + year_graph + pynutil.insert('"') + graph_year = ( + delete_extra_space + + pynutil.insert('year: "') + + year_graph + + pynutil.insert('"') + ) prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) delete_prefix = pynutil.delete(prefix_graph) - graph_prefix = pynutil.insert("morphosyntactic_features: \"") + prefix_graph + pynutil.insert("\"") + graph_prefix = ( + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + ) year_prefix_graph = ( - pynutil.insert("morphosyntactic_features: \"") + pynutil.insert('morphosyntactic_features: "') + pynini.closure(prefix_graph, 0, 1) + pynini.union("שנה", "שנת") - + pynutil.insert("\"") + + pynutil.insert('"') ) graph_dm = ( @@ -98,7 +120,11 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): + graph_year ) - graph_my = pynini.closure(graph_prefix + insert_space, 0, 1) + month_names_graph + graph_year + graph_my = ( + pynini.closure(graph_prefix + insert_space, 0, 1) + + month_names_graph + + graph_year + ) graph_y_only = year_prefix_graph + graph_year final_graph = graph_dm | graph_dmy | graph_my | graph_y_only diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py index 2a3a319b2..253f78f88 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py +++ 
b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py @@ -15,33 +15,32 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import MINUS, GraphFst, delete_and -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + MINUS, GraphFst, delete_and) +from nemo_text_processing.inverse_text_normalization.he.utils import \ + get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, - delete_extra_space, - delete_space, - delete_zero_or_one_space, - insert_space, -) + NEMO_DIGIT, delete_extra_space, delete_space, delete_zero_or_one_space, + insert_space) -def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike": +def get_quantity( + decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike" +) -> "pynini.FstLike": """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral in Hebrew, - e.g. one million -> integer_part: "1" quantity: "million" - e.g. 
one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ numbers = cardinal_up_to_hundred @ ( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) + pynutil.delete(pynini.closure("0")) + + pynini.difference(NEMO_DIGIT, "0") + + pynini.closure(NEMO_DIGIT) ) - suffix_labels = ["אלף", "מיליון", "מיליארד"] - suffix_labels = [x for x in suffix_labels if x != "אלף"] + suffix_labels = ["מיליון", "מיליארד"] suffix = pynini.union(*suffix_labels).optimize() res = ( @@ -53,8 +52,13 @@ def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstL + suffix + pynutil.insert('"') ) - res |= decimal + delete_extra_space + pynutil.insert('quantity: "') + (suffix | "אלף") + pynutil.insert('"') - + res |= ( + decimal + + delete_extra_space + + pynutil.insert('quantity: "') + + (suffix | "אלף") + + pynutil.insert('"') + ) return res @@ -75,7 +79,10 @@ def __init__(self, cardinal: GraphFst): prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + + insert_space, 0, 1, ) @@ -84,37 +91,59 @@ def __init__(self, cardinal: GraphFst): cardinal_graph = cardinal.graph_no_exception # all fractions - fractions = pynini.string_file(get_abs_path("data/numbers/decimal_fractions.tsv")) + fractions = pynini.string_file( + get_abs_path("data/numbers/decimal_fractions.tsv") + ) fractions_graph = delete_zero_or_one_space + delete_and + fractions - fractions_graph = pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"') + fractions_graph = ( + pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"') + ) - # identify decimals that can be understood time, and 
don't convert them to avoid ambiguity - viable_minutes_verbose = ["חצי", "רבע"] - viable_minutes_exception = pynini.union(*viable_minutes_verbose).optimize() - fractions_wo_minutes = (pynini.project(fractions, "input") - viable_minutes_exception.arcsort()) @ fractions - fractions_wo_minutes = delete_zero_or_one_space + delete_and + fractions_wo_minutes - fractions_wo_minutes = pynutil.insert('fractional_part: "') + fractions_wo_minutes + pynutil.insert('"') + # identify decimals that can be understood as time, don't convert them to avoid ambiguity + viable_minutes_exception = pynini.string_file( + get_abs_path("data/decimals/minutes_exception.tsv") + ) + fractions_wo_minutes = ( + pynini.project(fractions, "input") - viable_minutes_exception.arcsort() + ) @ fractions + fractions_wo_minutes = ( + delete_zero_or_one_space + delete_and + fractions_wo_minutes + ) + fractions_wo_minutes = ( + pynutil.insert('fractional_part: "') + + fractions_wo_minutes + + pynutil.insert('"') + ) graph_decimal = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_decimal |= cardinal.graph_two_digit - graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal self.graph = graph_decimal point = pynutil.delete("נקודה") - graph_negative = pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space + graph_negative = ( + pynutil.insert("negative: ") + + pynini.cross(MINUS, '"true"') + + delete_extra_space + ) optional_graph_negative = pynini.closure( graph_negative, 0, 1, ) - graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') - graph_fractional = pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"') + graph_integer = ( + pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + ) + graph_fractional = ( + pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"') + ) # integer could be an hour, but minutes cannot: convert to decimal - 
viable_hour_unviable_minutes = graph_integer + delete_extra_space + fractions_wo_minutes + viable_hour_unviable_minutes = ( + graph_integer + delete_extra_space + fractions_wo_minutes + ) # integer cannot be an hour, but minutes can: convert to decimal unviable_hour_viable_minutes = ( @@ -126,22 +155,33 @@ def __init__(self, cardinal: GraphFst): ) # minus sign followed by ambiguous decimal: convert to decimal, there is no negative time - negative_viable_time = graph_negative + graph_integer + delete_extra_space + fractions_graph + negative_viable_time = ( + graph_negative + graph_integer + delete_extra_space + fractions_graph + ) # all decimals with fractions, not excluding anything (used in other FSTs) all_decimals_wo_point = graph_integer + delete_extra_space + fractions_graph # only cases with fractional part that cannot be interpreted as time - graph_wo_point = viable_hour_unviable_minutes | unviable_hour_viable_minutes | negative_viable_time + graph_wo_point = ( + viable_hour_unviable_minutes + | unviable_hour_viable_minutes + | negative_viable_time + ) # all decimals with the word "point" graph_w_point = ( - pynini.closure(graph_integer + delete_extra_space, 0, 1) + point + delete_extra_space + graph_fractional + pynini.closure(graph_integer + delete_extra_space, 0, 1) + + point + + delete_extra_space + + graph_fractional ) final_graph_wo_sign = graph_w_point | graph_wo_point self.final_graph_wo_sign = graph_w_point | all_decimals_wo_point - final_graph = optional_prefix_graph + optional_graph_negative + final_graph_wo_sign + final_graph = ( + optional_prefix_graph + optional_graph_negative + final_graph_wo_sign + ) quantity_graph = get_quantity(self.final_graph_wo_sign, cardinal.graph_hundred) final_graph |= optional_prefix_graph + optional_graph_negative + quantity_graph diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py index 18d4d030c..e849e5a80 100644 
--- a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py @@ -15,28 +15,28 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import \ + CardinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import \ + DecimalFst +from nemo_text_processing.inverse_text_normalization.he.utils import \ + get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_SPACE, - delete_extra_space, - delete_space, - delete_zero_or_one_space, - insert_space, -) + NEMO_SPACE, delete_extra_space, delete_space, delete_zero_or_one_space, + insert_space) class MeasureFst(GraphFst): """ Finite state transducer for classifying measure in Hebrew e.g. מש עשרה אחוז -> measure { cardinal { integer: "15" } units: "%" } - e.g. מינוס חמש עשרה אחוז -> measure { negative: "-" cardinal { integer: "15" } units: "%" } - e.g. שלוש מיליגרם -> measure { cardinal { integer: "3" } spaced_units: "מ״ג" } + e.g. מינוס חמש עשרה אחוז -> measure { cardinal { negative: "-" integer: "15" } units: "%" } + e.g. שלוש מיליגרם -> measure { cardinal { integer: "3" } units: "מ״ג" } e.g. אלף אחוז -> measure { cardinal { integer: "1000" } units: "%" } e.g. אחוז אחד -> measure { units: "%" cardinal { integer: "1" } } - e.g. סנטימטר אחד -> measure { spaced_units: "ס״מ" cardinal { integer: "1" } } + e.g. 
סנטימטר אחד -> measure { units: "ס״מ" cardinal { integer: "1" } } Args: cardinal: CardinalFst @@ -48,14 +48,17 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): # optional negative sign optional_graph_negative = pynini.closure( - pynutil.insert("code_switch: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE, + pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE, 0, 1, ) prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + + insert_space, 0, 1, ) @@ -65,11 +68,16 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): # Let singular apply to values > 1 as they could be part of an adjective phrase (e.g. 14 foot tall building) subgraph_decimal = ( - pynutil.insert("decimal { ") + decimal.final_graph_wo_sign + pynutil.insert(" }") + delete_extra_space + pynutil.insert("decimal { ") + + optional_graph_negative + + decimal.final_graph_wo_sign + + pynutil.insert(" }") + + delete_extra_space ) subgraph_cardinal = ( pynutil.insert("cardinal { ") + + optional_graph_negative + pynutil.insert('integer: "') + cardinal_graph + pynutil.insert('"') @@ -84,7 +92,9 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): spaced_units = pynini.string_file(get_abs_path("data/spaced_measurements.tsv")) spaced_units = pynini.invert(spaced_units) - spaced_units = pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"') # noqa: W605 + spaced_units = ( + pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"') + ) # noqa: W605 # in joint units the unit is concatenated to the number, in spaced unit separate the unit with a space units_graph = joined_units | spaced_units @@ -102,8 +112,12 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): ) 
number_graph = subgraph_decimal | subgraph_cardinal - number_unit_graph = (number_graph + units_graph) | (units_graph + delete_space + one_graph) + number_unit_graph = (number_graph + units_graph) | ( + units_graph + delete_space + one_graph + ) - final_graph = optional_prefix_graph + optional_graph_negative + number_unit_graph + delete_zero_or_one_space + final_graph = ( + optional_prefix_graph + number_unit_graph + delete_zero_or_one_space + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py index c7306ea43..c61494350 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py @@ -15,8 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import \ + get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py index b963e7b74..a0db2eec1 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py @@ -15,7 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst class PunctuationFst(GraphFst): diff --git 
a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py index 43e07b932..07c6c8132 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py @@ -15,16 +15,15 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, delete_and -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path, integer_to_text +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, delete_and) +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import \ + CardinalFst +from nemo_text_processing.inverse_text_normalization.he.utils import ( + get_abs_path, integer_to_text) from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, - delete_extra_space, - delete_space, - delete_zero_or_one_space, - insert_space, -) + NEMO_DIGIT, delete_extra_space, delete_space, delete_zero_or_one_space, + insert_space) class TimeFst(GraphFst): @@ -32,7 +31,7 @@ class TimeFst(GraphFst): Finite state transducer for classifying time in Hebrew. Conversion is made only when am / pm time is not ambiguous! e.g. שלוש דקות לחצות -> time { minutes: "57" hours: "23" } - e.g. באחת ושתי דקות בצהריים -> time { prefix: "ב" hours: "1" minutes: "02" suffix: "צהריים" } + e.g. באחת ושתי דקות בצהריים -> time { morphosyntactic_features: "ב" hours: "1" minutes: "02" suffix: "צהריים" } e.g. שתיים ועשרה בבוקר -> time { hours: "2" minutes: "10" suffix: "בוקר" } e.g. שתיים ועשרה בצהריים -> time { hours: "2" minutes: "10" suffix: "צהריים" } e.g. 
שתיים עשרה ושלוש דקות אחרי הצהריים -> time { hours: "12" minutes: "03" suffix: "צהריים" } @@ -44,76 +43,85 @@ def __init__(self): super().__init__(name="time", kind="classify") # hours, minutes, seconds, suffix, zone, style, speak_period + midnight_to_hour_graph = pynini.string_file( + get_abs_path("data/time/midnight_to_hour.tsv") + ) to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv")) + + minute_verbose_graph = pynini.string_file( + get_abs_path("data/time/minute_verbose.tsv") + ) minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv")) - suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv")) + minute_to_verbose_graph = pynini.string_file( + get_abs_path("data/time/minute_to_verbose.tsv") + ) + + suffix_graph = pynini.union( + pynini.string_file(get_abs_path("data/time/day_suffix.tsv")), + pynini.string_file(get_abs_path("data/time/noon_suffix.tsv")), + pynini.string_file(get_abs_path("data/time/evening_suffix.tsv")), + pynini.string_file(get_abs_path("data/time/night_suffix.tsv")), + ) time_prefix = pynini.string_file(get_abs_path("data/prefix.tsv")) time_prefix_graph = ( - pynutil.insert("morphosyntactic_features: \"") + time_prefix + pynutil.insert("\"") + insert_space + pynutil.insert('morphosyntactic_features: "') + + time_prefix + + pynutil.insert('"') + + insert_space ) - optional_time_prefix_graph = pynini.closure(time_prefix_graph, 0, 1) - graph_minute_verbose = pynini.string_map( - [ - ("שלושת רבעי", "45"), - ("חצי", "30"), - ("רבע", "15"), - ("עשרים", "20"), - ("עשרה", "10"), - ("חמישה", "05"), - ("דקה", "01"), - ("שתי", "02"), - ] - ) - - graph_minute_to_verbose = pynini.string_map( - [ - ("רבע", "45"), - ("עשרה", "50"), - ("חמישה", "55"), - ("עשרים", "40"), - ("עשרים וחמישה", "35"), - ("דקה", "59"), - ] - ) - # only used for < 1000 thousand -> 0 weight cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7) labels_hour = [integer_to_text(x, only_fem=True)[0] for x 
in range(1, 13)] - labels_minute_single = [integer_to_text(x, only_fem=True)[0] for x in range(2, 10)] - labels_minute_double = [integer_to_text(x, only_fem=True)[0] for x in range(10, 60)] + labels_minute_single = [ + integer_to_text(x, only_fem=True)[0] for x in range(2, 10) + ] + labels_minute_double = [ + integer_to_text(x, only_fem=True)[0] for x in range(10, 60) + ] - midnight = pynini.string_map([("חצות", "0")]) graph_hour = pynini.union(*labels_hour) @ cardinal - graph_hour |= midnight + graph_hour |= midnight_to_hour_graph add_leading_zero_to_double_digit = pynutil.insert("0") + NEMO_DIGIT - graph_minute_single = pynini.union(*labels_minute_single) @ cardinal @ add_leading_zero_to_double_digit + graph_minute_single = ( + pynini.union(*labels_minute_single) + @ cardinal + @ add_leading_zero_to_double_digit + ) graph_minute_double = pynini.union(*labels_minute_double) @ cardinal final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') - graph_minute = pynini.union(pynutil.insert("00"), graph_minute_single, graph_minute_double) + graph_minute = pynini.union( + pynutil.insert("00"), graph_minute_single, graph_minute_double + ) final_suffix = pynutil.insert('suffix: "') + suffix_graph + pynutil.insert('"') final_suffix = delete_space + insert_space + final_suffix + time_word = "דקות" + optional_delete_time = pynini.closure( + delete_space + pynutil.delete(time_word), 0, 1 + ) graph_h_and_m = ( final_graph_hour + delete_space + delete_and + insert_space + pynutil.insert('minutes: "') - + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose) + + pynini.union( + graph_minute_single, graph_minute_double, minute_verbose_graph + ) + pynutil.insert('"') - + (pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1)) + + optional_delete_time ) graph_special_m_to_h_suffix_time = ( pynutil.insert('minutes: "') - + graph_minute_to_verbose + + minute_to_verbose_graph + pynutil.insert('"') + delete_space + pynutil.delete("ל") 
@@ -127,7 +135,7 @@ def __init__(self): pynutil.insert('minutes: "') + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph + pynutil.insert('"') - + pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1) + + optional_delete_time + delete_space + pynutil.delete("ל") + insert_space @@ -151,7 +159,7 @@ def __init__(self): optional_time_prefix_graph + delete_zero_or_one_space + pynutil.insert('hours: "') - + midnight + + midnight_to_hour_graph + pynutil.insert('"') + insert_space + pynutil.insert('minutes: "') @@ -161,20 +169,22 @@ def __init__(self): graph_midnight_and_m = ( pynutil.insert('hours: "') - + midnight + + midnight_to_hour_graph + pynutil.insert('"') + delete_space + delete_and + insert_space + pynutil.insert('minutes: "') - + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose) + + pynini.union( + graph_minute_single, graph_minute_double, minute_verbose_graph + ) + pynutil.insert('"') - + (pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1)) + + optional_delete_time ) to_midnight_verbose_graph = ( pynutil.insert('minutes: "') - + graph_minute_to_verbose + + minute_to_verbose_graph + pynutil.insert('"') + delete_space + pynutil.delete("ל") @@ -188,7 +198,7 @@ def __init__(self): pynutil.insert('minutes: "') + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph + pynutil.insert('"') - + pynini.closure(delete_space + pynutil.delete("דקות"), 0, 1) + + optional_delete_time + delete_space + pynutil.delete("ל") + insert_space @@ -200,13 +210,22 @@ def __init__(self): final_graph_midnight = ( optional_time_prefix_graph + delete_zero_or_one_space - + (midnight_graph | to_midnight_verbose_graph | graph_m_to_midnight | graph_midnight_and_m) + + ( + midnight_graph + | to_midnight_verbose_graph + | graph_m_to_midnight + | graph_midnight_and_m + ) ) final_graph = ( optional_time_prefix_graph + delete_zero_or_one_space - + (graph_h_and_m | graph_special_m_to_h_suffix_time | 
graph_m_to_h_suffix_time) + + ( + graph_h_and_m + | graph_special_m_to_h_suffix_time + | graph_m_to_h_suffix_time + ) + final_suffix ) final_graph |= graph_h diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py index 807dcf734..361b8d007 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py @@ -18,17 +18,28 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.date import DateFst -from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.measure import MeasureFst -from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import OrdinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import PunctuationFst -from nemo_text_processing.inverse_text_normalization.he.taggers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import WhiteListFst -from nemo_text_processing.inverse_text_normalization.he.taggers.word import WordFst -from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, generator_main +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import \ + CardinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.date import \ + DateFst +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import \ + 
DecimalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.measure import \ + MeasureFst +from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import \ + OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import \ + PunctuationFst +from nemo_text_processing.inverse_text_normalization.he.taggers.time import \ + TimeFst +from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import \ + WhiteListFst +from nemo_text_processing.inverse_text_normalization.he.taggers.word import \ + WordFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + delete_extra_space, delete_space, generator_main) class ClassifyFst(GraphFst): @@ -88,13 +99,21 @@ def __init__( | pynutil.add_weight(word_graph, 100) ) - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + punct = ( + pynutil.insert("tokens { ") + + pynutil.add_weight(punct_graph, weight=1.1) + + pynutil.insert(" }") + ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + pynini.closure(punct + pynutil.insert(" ")) + + token + + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = token_plus_punct + pynini.closure( + delete_extra_space + token_plus_punct + ) graph = delete_space + graph + delete_space self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py index 58de7668e..2273a6435 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py @@ -17,9 +17,12 @@ import pynini from pynini.lib import pynutil -from 
nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, string_map_cased -from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import convert_space, insert_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + GraphFst, string_map_cased) +from nemo_text_processing.inverse_text_normalization.he.utils import \ + get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + convert_space, insert_space) class WhiteListFst(GraphFst): @@ -45,11 +48,16 @@ def __init__(self, input_file: str = None): raise ValueError(f"Whitelist file {input_file} not found") optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + pynutil.insert('morphosyntactic_features: "') + + prefix_graph + + pynutil.insert('"') + + insert_space, 0, 1, ) whitelist = string_map_cased(input_file) - graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"') + graph = ( + pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"') + ) final_graph = optional_prefix_graph + graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py index 6b5394ac3..6c0ee4fd9 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py @@ -15,8 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import \ + 
NEMO_NOT_SPACE class WordFst(GraphFst): @@ -27,5 +29,9 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert('name: "') + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert('"') + word = ( + pynutil.insert('name: "') + + pynini.closure(NEMO_NOT_SPACE, 1) + + pynutil.insert('"') + ) self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/utils.py b/nemo_text_processing/inverse_text_normalization/he/utils.py index 42a89dd78..3348c9f86 100644 --- a/nemo_text_processing/inverse_text_normalization/he/utils.py +++ b/nemo_text_processing/inverse_text_normalization/he/utils.py @@ -54,7 +54,10 @@ "9": "תשעים", } -ten = {"short": "עשר", "long": "עשרה"} # double pronunciation: short is 'eser' and 'asar', long is 'esre' and 'asara' +ten = { + "short": "עשר", + "long": "עשרה", +} # double pronunciation: short is 'eser' and 'asar', long is 'esre' and 'asara' ############# @@ -69,7 +72,7 @@ def get_abs_path(rel_path): Returns absolute path """ - return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path def augment_labels_with_punct_at_end(labels): @@ -92,7 +95,7 @@ def augment_labels_with_punct_at_end(labels): def digit_by_digit(num): - dbd = [' '.join([units_feminine_dict[digit] for digit in num])] + dbd = [" ".join([units_feminine_dict[digit] for digit in num])] # generate "1" as masculine and as feminine if exists if units_feminine_dict["1"] in dbd[0]: @@ -106,7 +109,7 @@ def integer_to_text(num, only_fem=False): num = str(num) # number is zero if num == len(num) * "0": - return ['אפס'] + return ["אפס"] else: # remove leading zeros from number num = num.lstrip("0") @@ -174,6 +177,8 @@ def _less_than_100(num, only_fem=False): else: res.append(f'{tens_dict[num[0]]} {"ו"}{units_feminine_dict[num[1]]}') if not only_fem: - res.append(f'{tens_dict[num[0]]} {"ו"}{units_masculine_dict[num[1]]}') + res.append( 
+ f'{tens_dict[num[0]]} {"ו"}{units_masculine_dict[num[1]]}' + ) return res diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py index d26e1f703..11a14695b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -15,8 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + NEMO_ALPHA_HE, GraphFst) +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, delete_space) class CardinalFst(GraphFst): @@ -36,7 +38,10 @@ def __init__(self): at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # Thousands separator - group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + group_by_threes = ( + at_most_three_digits + + (pynutil.insert(",") + exactly_three_digits).closure() + ) # Keep the prefix if exists and add a dash optional_prefix = pynini.closure( diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py index fc6211d16..9379fbf50 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py @@ -15,14 +15,11 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_NOT_QUOTE, - 
NEMO_SPACE, - delete_space, - delete_zero_or_one_space, - insert_space, -) + NEMO_NOT_QUOTE, NEMO_SPACE, delete_space, delete_zero_or_one_space, + insert_space) class DateFst(GraphFst): @@ -37,54 +34,54 @@ def __init__(self): day_prefix = ( pynutil.delete("morphosyntactic_features:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.insert('-') - + pynutil.delete("\"") + + pynutil.insert("-") + + pynutil.delete('"') ) day = ( pynutil.delete("day:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1, 2) - + pynutil.insert('.') - + pynutil.delete("\"") + + pynutil.insert(".") + + pynutil.delete('"') + delete_space ) month_prefix = ( pynutil.delete("morphosyntactic_features:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space ) month = ( pynutil.delete("month:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) year_prefix = ( pynutil.delete("morphosyntactic_features:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 3) - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space ) year = ( pynutil.delete("year:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) ####################### @@ -101,7 +98,13 @@ def __init__(self): ) # day month and year - graph_dmy = graph_dm + delete_space + pynutil.insert('.') + delete_zero_or_one_space + year + graph_dmy = ( + graph_dm + + delete_space + + pynutil.insert(".") + + delete_zero_or_one_space + + year + ) # only month and year graph_my = ( diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py 
b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py index 86f20f882..d4d3862d2 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py @@ -15,8 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + NEMO_ALPHA_HE, GraphFst) +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space) class DecimalFst(GraphFst): @@ -30,21 +32,26 @@ class DecimalFst(GraphFst): def __init__(self): super().__init__(name="decimal", kind="verbalize") - optionl_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1) + optionl_sign = pynini.closure( + pynini.cross('negative: "true"', "-") + delete_space, 0, 1 + ) # Need parser to group digits by threes exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # Thousands separator - group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + group_by_threes = ( + at_most_three_digits + + (pynutil.insert(",") + exactly_three_digits).closure() + ) integer = ( pynutil.delete("integer_part:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) integer = integer @ group_by_threes @@ -55,35 +62,39 @@ def __init__(self): pynutil.insert(".") + pynutil.delete("fractional_part:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) optional_fractional = pynini.closure(fractional + 
delete_space, 0, 1) quantity = ( pynutil.delete("quantity:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') + ) + optional_quantity = pynini.closure( + pynutil.insert(" ") + quantity + delete_space, 0, 1 ) - optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1) # Keep the prefix if exists and add a dash optional_prefix = pynini.closure( pynutil.delete("morphosyntactic_features:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_ALPHA_HE, 1) + pynutil.insert("-") - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space, 0, 1, ) - graph = optional_prefix + optional_integer + optional_fractional + optional_quantity + graph = ( + optional_prefix + optional_integer + optional_fractional + optional_quantity + ) self.numbers = graph graph = optionl_sign + graph delete_tokens = self.delete_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py index c2ae11e54..c6bcb9ecb 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -15,25 +15,21 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_CHAR, - NEMO_NOT_QUOTE, - NEMO_SIGMA, - NEMO_SPACE, - delete_space, -) + NEMO_CHAR, NEMO_NOT_QUOTE, NEMO_SIGMA, NEMO_SPACE, delete_space) class MeasureFst(GraphFst): """ Finite state transducer for verbalizing measure, in Hebrew. Some measures are concatenated to the numbers and other are don't (two measure lists) - e.g. 
measure { cardinal { integer: "3" } spaced_units: "מ״ג" } -> 3 מ״ג + e.g. measure { cardinal { integer: "3" } units: "מ״ג" } -> 3 מ״ג e.g. measure { cardinal { integer: "1000" } units: "%" } -> 1,000% e.g. measure { units: "%" cardinal { integer: "1" } } -> 1% - e.g. measure { spaced_units: "ס״מ" cardinal { integer: "1" } } -> 1 ס״מ - e.g. measure { prefix: "ל" cardinal { integer: "4" } spaced_units: "ס״מ" } -> ל-4 ס״מ + e.g. measure { units: "ס״מ" cardinal { integer: "1" } } -> 1 ס״מ + e.g. measure { prefix: "ל" cardinal { integer: "4" } units: "ס״מ" } -> ל-4 ס״מ Args: decimal: DecimalFst @@ -57,7 +53,7 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst): # Removes the negative attribute and leaves the sign if occurs optional_sign = pynini.closure( - pynutil.delete("code_switch:") + pynutil.delete("negative:") + delete_space + pynutil.delete('"') + pynini.accep("-") @@ -68,11 +64,21 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst): ) graph_decimal = ( - pynutil.delete("decimal {") + delete_space + decimal.numbers + delete_space + pynutil.delete("}") + pynutil.delete("decimal {") + + delete_space + + optional_sign + + decimal.numbers + + delete_space + + pynutil.delete("}") ) graph_cardinal = ( - pynutil.delete("cardinal {") + delete_space + cardinal.numbers + delete_space + pynutil.delete("}") + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + cardinal.numbers + + delete_space + + pynutil.delete("}") ) unit = ( @@ -90,8 +96,13 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst): numbers_units = delete_space + unit numbers_graph = (graph_cardinal | graph_decimal) + numbers_units - one_graph = delete_space + pynutil.insert("1") + unit + pynutil.delete('cardinal { integer: "1" }') + one_graph = ( + delete_space + + pynutil.insert("1") + + unit + + pynutil.delete('cardinal { integer: "1" }') + ) - graph = optional_prefix + optional_sign + (numbers_graph | one_graph) + graph = optional_prefix + (numbers_graph | 
one_graph) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py index e71b76ecb..fb63ba0aa 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py @@ -15,8 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, delete_space) class OrdinalFst(GraphFst): @@ -30,9 +32,9 @@ def __init__(self): graph = ( pynutil.delete("integer:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py index 110952dac..9d412a1a8 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py @@ -15,14 +15,13 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import \ + get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, - NEMO_NOT_QUOTE, - delete_space, - 
delete_zero_or_one_space, - insert_space, -) + NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space, delete_zero_or_one_space, + insert_space) class TimeFst(GraphFst): @@ -30,98 +29,77 @@ class TimeFst(GraphFst): Finite state transducer for verbalizing time in Hebrew e.g. time { hours: "2" minutes: "55" suffix: "בלילה" } -> 2:55 בלילה e.g. time { hours: "2" minutes: "57" suffix: "בבוקר" } -> 2:57 בבוקר - e.g. time { prefix: "ב" hours: "6" minutes: "32" suffix: "בערב" } -> ב-18:32 בערב - e.g. time { prefix: "בשעה" hours: "2" minutes: "10" suffix: "בצהריים" } -> בשעה-14:10 בצהריים + e.g. time { morphosyntactic_features: "ב" hours: "6" minutes: "32" suffix: "בערב" } -> ב-18:32 בערב + e.g. time { morphosyntactic_features: "בשעה" hours: "2" minutes: "10" suffix: "בצהריים" } -> בשעה-14:10 בצהריים """ def __init__(self): super().__init__(name="time", kind="verbalize") - hour_to_noon = pynini.string_map( - [ - ("12", "12"), - ("1", "13"), - ("2", "14"), - ("3", "15"), - ("4", "16"), - ("5", "17"), - ("6", "18"), - ] - ) - - hour_to_evening = pynini.string_map( - [ - ("5", "17"), - ("6", "18"), - ("7", "19"), - ("8", "20"), - ("9", "21"), - ("10", "22"), - ("11", "23"), - ] - ) - - hour_to_night = pynini.string_map( - [ - ("8", "20"), - ("9", "21"), - ("10", "22"), - ("11", "23"), - ("12", "0"), - ("1", "1"), - ("2", "2"), - ("3", "3"), - ("4", "4"), - ] + hour_to_noon = pynini.string_file(get_abs_path("data/time/hour_to_noon.tsv")) + hour_to_evening = pynini.string_file( + get_abs_path("data/time/hour_to_evening.tsv") ) + hour_to_night = pynini.string_file(get_abs_path("data/time/hour_to_night.tsv")) + day_suffixes = pynini.string_file(get_abs_path("data/time/day_suffix.tsv")) day_suffixes = ( insert_space - + pynutil.delete("suffix: \"") - + (pynini.accep("בבוקר") | pynini.accep("לפנות בוקר")) - + pynutil.delete("\"") + + pynutil.delete('suffix: "') + + day_suffixes + + pynutil.delete('"') ) + noon_suffixes = pynini.string_file(get_abs_path("data/time/noon_suffix.tsv")) noon_suffixes 
= ( insert_space - + pynutil.delete("suffix: \"") - + (pynini.accep("בצהריים") | pynini.accep("אחרי הצהריים") | pynini.accep("אחר הצהריים")) - + pynutil.delete("\"") + + pynutil.delete('suffix: "') + + noon_suffixes + + pynutil.delete('"') ) + evening_suffixes = pynini.string_file( + get_abs_path("data/time/evening_suffix.tsv") + ) evening_suffixes = ( insert_space - + pynutil.delete("suffix: \"") - + (pynini.accep("בערב") | pynini.accep("לפנות ערב")) - + pynutil.delete("\"") + + pynutil.delete('suffix: "') + + evening_suffixes + + pynutil.delete('"') ) - night_suffixes = insert_space + pynutil.delete("suffix: \"") + pynini.accep("בלילה") + pynutil.delete("\"") + night_suffixes = pynini.string_file(get_abs_path("data/time/night_suffix.tsv")) + night_suffixes = ( + insert_space + + pynutil.delete('suffix: "') + + night_suffixes + + pynutil.delete('"') + ) hour = ( pynutil.delete("hours:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) minute = ( pynutil.delete("minutes:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_DIGIT, 1) - + pynutil.delete("\"") + + pynutil.delete('"') ) prefix = ( pynutil.delete("morphosyntactic_features:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.insert("-") - + pynutil.delete("\"") + + pynutil.delete('"') ) optional_prefix = pynini.closure(prefix + delete_zero_or_one_space, 0, 1) @@ -129,9 +107,17 @@ def __init__(self): graph = hour + delete_space + pynutil.insert(":") + minute + optional_suffix for hour_to, suffix in zip( - [hour_to_noon, hour_to_evening, hour_to_night], [noon_suffixes, evening_suffixes, night_suffixes] + [hour_to_noon, hour_to_evening, hour_to_night], + [noon_suffixes, evening_suffixes, night_suffixes], ): - graph |= hour @ hour_to + delete_space + pynutil.insert(":") + minute + delete_space + suffix + graph |= ( + 
hour @ hour_to + + delete_space + + pynutil.insert(":") + + minute + + delete_space + + suffix + ) graph |= optional_prefix + graph delete_tokens = self.delete_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py index 0223259db..bd2a39c52 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py @@ -12,14 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.date import DateFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.decimal import DecimalFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.measure import MeasureFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.ordinal import OrdinalFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.cardinal import \ + CardinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.date import \ + DateFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.decimal import \ + DecimalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.measure import \ + MeasureFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.ordinal import \ + OrdinalFst +from 
nemo_text_processing.inverse_text_normalization.he.verbalizers.time import \ + TimeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.whitelist import \ + WhiteListFst class VerbalizeFst(GraphFst): @@ -49,6 +57,12 @@ def __init__(self): whitelist_graph = WhiteListFst().fst graph = ( - time_graph | date_graph | measure_graph | ordinal_graph | decimal_graph | cardinal_graph | whitelist_graph + time_graph + | date_graph + | measure_graph + | ordinal_graph + | decimal_graph + | cardinal_graph + | whitelist_graph ) self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py index 611181df4..261d11092 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py @@ -15,10 +15,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.word import WordFst -from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import \ + VerbalizeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.word import \ + WordFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + delete_extra_space, delete_space) class VerbalizeFinalFst(GraphFst): @@ -40,5 +44,10 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + delete_space + pynutil.delete("}") ) - graph = delete_space + pynini.closure(graph + 
delete_extra_space) + graph + delete_space + graph = ( + delete_space + + pynini.closure(graph + delete_extra_space) + + graph + + delete_space + ) self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py index 9446614a9..5cb519c14 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py @@ -15,8 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( + NEMO_ALPHA_HE, GraphFst) +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_CHAR, NEMO_SIGMA, delete_space) class WhiteListFst(GraphFst): @@ -31,9 +33,9 @@ def __init__(self): optional_prefix = pynini.closure( pynutil.delete("morphosyntactic_features:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_ALPHA_HE, 1) - + pynutil.delete("\"") + + pynutil.delete('"') + delete_space, 0, 1, @@ -41,10 +43,12 @@ def __init__(self): graph = ( pynutil.delete("name:") + delete_space - + pynutil.delete("\"") + + pynutil.delete('"') + pynini.closure(NEMO_CHAR - " ", 1) - + pynutil.delete("\"") + + pynutil.delete('"') + ) + graph = graph @ pynini.cdrewrite( + pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) final_graph = optional_prefix + graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py index 0083ecbd1..858ca8d60 100644 --- 
a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py @@ -15,8 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space +from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ + GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_CHAR, NEMO_SIGMA, delete_space) class WordFst(GraphFst): @@ -28,7 +30,13 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) - char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) + char = ( + pynutil.delete("name:") + + delete_space + + pynutil.delete('"') + + chars + + pynutil.delete('"') + ) + graph = char @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt index 43995e498..3d0a40a07 100644 --- a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt @@ -1,5 +1,6 @@ מינוס חמש עשרה אחוז~-15% חמש עשרה אחוז~15% +מינוס שתים עשרה נקודה חמש מעלות ~-12.5° שתיים עשרה נקודה חמש מעלות~12.5° שתיים עשרה נקודה חמש מעלות צלסיוס~12.5°C אלף אחוזים~1,000% From 9dd3c358723a20fd591a6794fd9b4fe189a8e3c0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Nov 2025 04:20:25 +0000 Subject: [PATCH 9/9] 
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../he/graph_utils.py | 11 +-- .../he/taggers/cardinal.py | 98 ++++++------------- .../he/taggers/date.py | 48 +++------ .../he/taggers/decimal.py | 98 +++++-------------- .../he/taggers/measure.py | 37 +++---- .../he/taggers/ordinal.py | 6 +- .../he/taggers/punctuation.py | 3 +- .../he/taggers/time.py | 77 +++++---------- .../he/taggers/tokenize_and_classify.py | 47 +++------ .../he/taggers/whitelist.py | 18 +--- .../he/taggers/word.py | 12 +-- .../inverse_text_normalization/he/utils.py | 4 +- .../he/verbalizers/cardinal.py | 11 +-- .../he/verbalizers/date.py | 19 ++-- .../he/verbalizers/decimal.py | 23 ++--- .../he/verbalizers/measure.py | 17 ++-- .../he/verbalizers/ordinal.py | 6 +- .../he/verbalizers/time.py | 59 +++-------- .../he/verbalizers/verbalize.py | 32 ++---- .../he/verbalizers/verbalize_final.py | 19 +--- .../he/verbalizers/whitelist.py | 10 +- .../he/verbalizers/word.py | 14 +-- 22 files changed, 195 insertions(+), 474 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py index 02642ea3d..072da0381 100644 --- a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py @@ -19,8 +19,7 @@ from pynini import Far from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_SIGMA, delete_space) +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, delete_space from nemo_text_processing.text_normalization.en.utils import load_labels NEMO_ALPHA_HE = pynini.union(*"אבגדהוזחטיכלמםנןסעפףצץקרשת").optimize() @@ -68,13 +67,9 @@ def __init__(self, name: str, kind: str, deterministic: bool = True): self._fst = None self.deterministic = deterministic - self.far_path = Path( - os.path.dirname(__file__) + 
"/grammars/" + kind + "/" + name + ".far" - ) + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") if self.far_exist(): - self._fst = Far( - self.far_path, mode="r", arc_type="standard", far_type="default" - ).get_fst() + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() def far_exist(self) -> bool: """ diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py index f5fc265ab..aaf30b32c 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -16,11 +16,19 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - NEMO_ALPHA_HE, GraphFst, delete_and, delete_optional_and) -from nemo_text_processing.inverse_text_normalization.he.utils import \ - get_abs_path + NEMO_ALPHA_HE, + GraphFst, + delete_and, + delete_optional_and, +) +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, delete_space, insert_space) + NEMO_DIGIT, + NEMO_SIGMA, + NEMO_SPACE, + delete_space, + insert_space, +) from nemo_text_processing.text_normalization.en.utils import load_labels @@ -47,14 +55,10 @@ def __init__(self): ) graph_two_digit = pynini.union(graph_teen, graph_ties) - self.graph_two_digit = pynini.union( - graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001) - ) + self.graph_two_digit = pynini.union(graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001)) # hundreds - hundred_exception = pynini.string_file( - get_abs_path("data/numbers/hundreds_exception.tsv") - ) + hundred_exception = pynini.string_file(get_abs_path("data/numbers/hundreds_exception.tsv")) delete_hundred = pynutil.delete("מאות") 
graph_hundred = delete_optional_and + pynini.union( hundred_exception, @@ -77,9 +81,7 @@ def __init__(self): ) # thousands - thousand_exception = pynini.string_file( - get_abs_path("data/numbers/thousands_exception.tsv") - ) + thousand_exception = pynini.string_file(get_abs_path("data/numbers/thousands_exception.tsv")) thousand_digit = pynini.string_file(get_abs_path("data/numbers/thousands.tsv")) delete_thousand = pynutil.delete("אלפים") | pynutil.delete("אלף", weight=0.001) @@ -95,48 +97,30 @@ def __init__(self): pynutil.insert("000", weight=0.001), ) - self.graph_thousands = pynini.union( - graph_thousands + delete_space + graph_hundred, graph_zero - ) + self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero) self.graph_thousands @= pynini.union( - pynutil.delete(pynini.closure("0")) - + pynini.difference(NEMO_DIGIT, "0") - + pynini.closure(NEMO_DIGIT), + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0", ) # millions - million_exceptions = pynini.string_file( - get_abs_path("data/numbers/millions_exception.tsv") - ) + million_exceptions = pynini.string_file(get_abs_path("data/numbers/millions_exception.tsv")) million_exceptions = pynutil.insert("00") + million_exceptions - delete_millions = pynutil.delete("מיליונים") | pynutil.delete( - "מיליון", weight=0.001 - ) + delete_millions = pynutil.delete("מיליונים") | pynutil.delete("מיליון", weight=0.001) many_millions = large_number_prefix + delete_space + delete_millions - graph_millions = pynini.union( - many_millions, million_exceptions, pynutil.insert("000", weight=0.001) - ) + graph_millions = pynini.union(many_millions, million_exceptions, pynutil.insert("000", weight=0.001)) graph = pynini.union( - graph_millions - + delete_space - + graph_thousands - + delete_space - + graph_hundred, + graph_millions + delete_space + graph_thousands + delete_space + graph_hundred, graph_zero, ) graph = graph @ pynini.union( - 
pynutil.delete(pynini.closure("0")) - + pynini.difference(NEMO_DIGIT, "0") - + pynini.closure(NEMO_DIGIT), + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0", ) labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv")) - labels_exception = list( - set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"]) - ) + labels_exception = list(set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"])) labels_exception += ["ו" + label for label in labels_exception] graph_exception = pynini.union(*labels_exception).optimize() graph = ((NEMO_ALPHA_HE + NEMO_SIGMA) @ graph).optimize() @@ -144,51 +128,31 @@ def __init__(self): self.graph_no_exception = graph ### Token insertion - minus_graph = ( - pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE - ) + minus_graph = pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE optional_minus_graph = pynini.closure(minus_graph, 0, 1) optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - + insert_space, + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, 0, 1, ) - graph_wo_small_digits = ( - pynini.project(graph, "input") - graph_exception.arcsort() - ) @ graph + graph_wo_small_digits = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph - cardinal_wo_viable_hours = load_labels( - get_abs_path("data/numbers/viable_hours.tsv") - ) + cardinal_wo_viable_hours = load_labels(get_abs_path("data/numbers/viable_hours.tsv")) cardinal_wo_viable_hours = list(set([x[0] for x in cardinal_wo_viable_hours])) viable_hours_exception = pynini.union(*cardinal_wo_viable_hours).optimize() - self.graph_wo_viable_hours = ( - pynini.project(graph, "input") - viable_hours_exception.arcsort() - ) @ graph + self.graph_wo_viable_hours = (pynini.project(graph, "input") - viable_hours_exception.arcsort()) @ 
graph small_number_with_minus = ( - insert_space - + minus_graph - + pynutil.insert('integer: "') - + self.graph_no_exception - + pynutil.insert('"') + insert_space + minus_graph + pynutil.insert('integer: "') + self.graph_no_exception + pynutil.insert('"') ) big_number_with_optional_minus = ( - optional_minus_graph - + pynutil.insert('integer: "') - + graph_wo_small_digits - + pynutil.insert('"') + optional_minus_graph + pynutil.insert('integer: "') + graph_wo_small_digits + pynutil.insert('"') ) - graph = optional_prefix_graph + ( - small_number_with_minus | big_number_with_optional_minus - ) + graph = optional_prefix_graph + (small_number_with_minus | big_number_with_optional_minus) final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py index 5c0b16360..cf9cacbd5 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/date.py @@ -15,12 +15,9 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.inverse_text_normalization.he.utils import \ - get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import ( - delete_extra_space, delete_space, insert_space) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, insert_space def _get_year_graph(graph_two_digits, graph_thousands): @@ -58,42 +55,23 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): day_graph = pynutil.insert('day: "') + day_graph + pynutil.insert('"') month_names = 
pynini.string_file(get_abs_path("data/months.tsv")) - month_names_graph = ( - pynutil.insert('month: "') + month_names + pynutil.insert('"') - ) + month_names_graph = pynutil.insert('month: "') + month_names + pynutil.insert('"') - month_name2number = pynini.string_file( - get_abs_path("data/months_name2number.tsv") - ) - month_name2number_graph = ( - pynutil.insert('month: "') + month_name2number + pynutil.insert('"') - ) + month_name2number = pynini.string_file(get_abs_path("data/months_name2number.tsv")) + month_name2number_graph = pynutil.insert('month: "') + month_name2number + pynutil.insert('"') - month_number2number = pynini.string_file( - get_abs_path("data/months_ordinal2number.tsv") - ) - month_number2number_graph = ( - pynutil.insert('month: "') + month_number2number + pynutil.insert('"') - ) + month_number2number = pynini.string_file(get_abs_path("data/months_ordinal2number.tsv")) + month_number2number_graph = pynutil.insert('month: "') + month_number2number + pynutil.insert('"') all_month_graph = month_name2number_graph | month_number2number_graph year_graph = _get_year_graph(two_digits_graph, cardinal.graph_thousands) - graph_year = ( - delete_extra_space - + pynutil.insert('year: "') - + year_graph - + pynutil.insert('"') - ) + graph_year = delete_extra_space + pynutil.insert('year: "') + year_graph + pynutil.insert('"') prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) delete_prefix = pynutil.delete(prefix_graph) - graph_prefix = ( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - ) + graph_prefix = pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') year_prefix_graph = ( pynutil.insert('morphosyntactic_features: "') + pynini.closure(prefix_graph, 0, 1) @@ -120,11 +98,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): + graph_year ) - graph_my = ( - pynini.closure(graph_prefix + insert_space, 0, 1) - + month_names_graph - + graph_year - ) + 
graph_my = pynini.closure(graph_prefix + insert_space, 0, 1) + month_names_graph + graph_year graph_y_only = year_prefix_graph + graph_year final_graph = graph_dm | graph_dmy | graph_my | graph_y_only diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py index 253f78f88..ecefb306a 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py @@ -15,18 +15,18 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - MINUS, GraphFst, delete_and) -from nemo_text_processing.inverse_text_normalization.he.utils import \ - get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import MINUS, GraphFst, delete_and +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, delete_extra_space, delete_space, delete_zero_or_one_space, - insert_space) + NEMO_DIGIT, + delete_extra_space, + delete_space, + delete_zero_or_one_space, + insert_space, +) -def get_quantity( - decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike" -) -> "pynini.FstLike": +def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike": """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral in Hebrew, @@ -35,9 +35,7 @@ def get_quantity( cardinal_up_to_hundred: cardinal FST """ numbers = cardinal_up_to_hundred @ ( - pynutil.delete(pynini.closure("0")) - + pynini.difference(NEMO_DIGIT, "0") - + pynini.closure(NEMO_DIGIT) + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) ) suffix_labels = ["מיליון", "מיליארד"] @@ -52,13 +50,7 @@ def get_quantity( + suffix + 
pynutil.insert('"') ) - res |= ( - decimal - + delete_extra_space - + pynutil.insert('quantity: "') - + (suffix | "אלף") - + pynutil.insert('"') - ) + res |= decimal + delete_extra_space + pynutil.insert('quantity: "') + (suffix | "אלף") + pynutil.insert('"') return res @@ -79,10 +71,7 @@ def __init__(self, cardinal: GraphFst): prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - + insert_space, + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, 0, 1, ) @@ -91,29 +80,15 @@ def __init__(self, cardinal: GraphFst): cardinal_graph = cardinal.graph_no_exception # all fractions - fractions = pynini.string_file( - get_abs_path("data/numbers/decimal_fractions.tsv") - ) + fractions = pynini.string_file(get_abs_path("data/numbers/decimal_fractions.tsv")) fractions_graph = delete_zero_or_one_space + delete_and + fractions - fractions_graph = ( - pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"') - ) + fractions_graph = pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"') # identify decimals that can be understood as time, don't convert them to avoid ambiguity - viable_minutes_exception = pynini.string_file( - get_abs_path("data/decimals/minutes_exception.tsv") - ) - fractions_wo_minutes = ( - pynini.project(fractions, "input") - viable_minutes_exception.arcsort() - ) @ fractions - fractions_wo_minutes = ( - delete_zero_or_one_space + delete_and + fractions_wo_minutes - ) - fractions_wo_minutes = ( - pynutil.insert('fractional_part: "') - + fractions_wo_minutes - + pynutil.insert('"') - ) + viable_minutes_exception = pynini.string_file(get_abs_path("data/decimals/minutes_exception.tsv")) + fractions_wo_minutes = (pynini.project(fractions, "input") - viable_minutes_exception.arcsort()) @ fractions + fractions_wo_minutes = 
delete_zero_or_one_space + delete_and + fractions_wo_minutes + fractions_wo_minutes = pynutil.insert('fractional_part: "') + fractions_wo_minutes + pynutil.insert('"') graph_decimal = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_decimal |= cardinal.graph_two_digit @@ -122,28 +97,18 @@ def __init__(self, cardinal: GraphFst): point = pynutil.delete("נקודה") - graph_negative = ( - pynutil.insert("negative: ") - + pynini.cross(MINUS, '"true"') - + delete_extra_space - ) + graph_negative = pynutil.insert("negative: ") + pynini.cross(MINUS, '"true"') + delete_extra_space optional_graph_negative = pynini.closure( graph_negative, 0, 1, ) - graph_integer = ( - pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') - ) - graph_fractional = ( - pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"') - ) + graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + graph_fractional = pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"') # integer could be an hour, but minutes cannot: convert to decimal - viable_hour_unviable_minutes = ( - graph_integer + delete_extra_space + fractions_wo_minutes - ) + viable_hour_unviable_minutes = graph_integer + delete_extra_space + fractions_wo_minutes # integer cannot be an hour, but minutes can: convert to decimal unviable_hour_viable_minutes = ( @@ -155,33 +120,22 @@ def __init__(self, cardinal: GraphFst): ) # minus sign followed by ambiguous decimal: convert to decimal, there is no negative time - negative_viable_time = ( - graph_negative + graph_integer + delete_extra_space + fractions_graph - ) + negative_viable_time = graph_negative + graph_integer + delete_extra_space + fractions_graph # all decimals with fractions, not excluding anything (used in other FSTs) all_decimals_wo_point = graph_integer + delete_extra_space + fractions_graph # only cases with fractional part that cannot be interpreted as time - graph_wo_point = ( - 
viable_hour_unviable_minutes - | unviable_hour_viable_minutes - | negative_viable_time - ) + graph_wo_point = viable_hour_unviable_minutes | unviable_hour_viable_minutes | negative_viable_time # all decimals with the word "point" graph_w_point = ( - pynini.closure(graph_integer + delete_extra_space, 0, 1) - + point - + delete_extra_space - + graph_fractional + pynini.closure(graph_integer + delete_extra_space, 0, 1) + point + delete_extra_space + graph_fractional ) final_graph_wo_sign = graph_w_point | graph_wo_point self.final_graph_wo_sign = graph_w_point | all_decimals_wo_point - final_graph = ( - optional_prefix_graph + optional_graph_negative + final_graph_wo_sign - ) + final_graph = optional_prefix_graph + optional_graph_negative + final_graph_wo_sign quantity_graph = get_quantity(self.final_graph_wo_sign, cardinal.graph_hundred) final_graph |= optional_prefix_graph + optional_graph_negative + quantity_graph diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py index e849e5a80..0232c4ff6 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/measure.py @@ -15,17 +15,17 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import \ - CardinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import \ - DecimalFst -from nemo_text_processing.inverse_text_normalization.he.utils import \ - get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst +from 
nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_SPACE, delete_extra_space, delete_space, delete_zero_or_one_space, - insert_space) + NEMO_SPACE, + delete_extra_space, + delete_space, + delete_zero_or_one_space, + insert_space, +) class MeasureFst(GraphFst): @@ -55,10 +55,7 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - + insert_space, + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, 0, 1, ) @@ -92,9 +89,7 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): spaced_units = pynini.string_file(get_abs_path("data/spaced_measurements.tsv")) spaced_units = pynini.invert(spaced_units) - spaced_units = ( - pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"') - ) # noqa: W605 + spaced_units = pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"') # noqa: W605 # in joint units the unit is concatenated to the number, in spaced unit separate the unit with a space units_graph = joined_units | spaced_units @@ -112,12 +107,8 @@ def __init__(self, cardinal: CardinalFst, decimal: DecimalFst): ) number_graph = subgraph_decimal | subgraph_cardinal - number_unit_graph = (number_graph + units_graph) | ( - units_graph + delete_space + one_graph - ) + number_unit_graph = (number_graph + units_graph) | (units_graph + delete_space + one_graph) - final_graph = ( - optional_prefix_graph + number_unit_graph + delete_zero_or_one_space - ) + final_graph = optional_prefix_graph + number_unit_graph + delete_zero_or_one_space final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git 
a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py index c61494350..c7306ea43 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py @@ -15,10 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.inverse_text_normalization.he.utils import \ - get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py index a0db2eec1..b963e7b74 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py @@ -15,8 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst class PunctuationFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py index 07c6c8132..ac4965cfc 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/time.py @@ -15,15 +15,16 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, delete_and) -from 
nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import \ - CardinalFst -from nemo_text_processing.inverse_text_normalization.he.utils import ( - get_abs_path, integer_to_text) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, delete_and +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path, integer_to_text from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, delete_extra_space, delete_space, delete_zero_or_one_space, - insert_space) + NEMO_DIGIT, + delete_extra_space, + delete_space, + delete_zero_or_one_space, + insert_space, +) class TimeFst(GraphFst): @@ -43,18 +44,12 @@ def __init__(self): super().__init__(name="time", kind="classify") # hours, minutes, seconds, suffix, zone, style, speak_period - midnight_to_hour_graph = pynini.string_file( - get_abs_path("data/time/midnight_to_hour.tsv") - ) + midnight_to_hour_graph = pynini.string_file(get_abs_path("data/time/midnight_to_hour.tsv")) to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv")) - minute_verbose_graph = pynini.string_file( - get_abs_path("data/time/minute_verbose.tsv") - ) + minute_verbose_graph = pynini.string_file(get_abs_path("data/time/minute_verbose.tsv")) minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv")) - minute_to_verbose_graph = pynini.string_file( - get_abs_path("data/time/minute_to_verbose.tsv") - ) + minute_to_verbose_graph = pynini.string_file(get_abs_path("data/time/minute_to_verbose.tsv")) suffix_graph = pynini.union( pynini.string_file(get_abs_path("data/time/day_suffix.tsv")), @@ -65,10 +60,7 @@ def __init__(self): time_prefix = pynini.string_file(get_abs_path("data/prefix.tsv")) time_prefix_graph = ( - pynutil.insert('morphosyntactic_features: "') - + time_prefix - + pynutil.insert('"') - + insert_space + 
pynutil.insert('morphosyntactic_features: "') + time_prefix + pynutil.insert('"') + insert_space ) optional_time_prefix_graph = pynini.closure(time_prefix_graph, 0, 1) @@ -76,45 +68,31 @@ def __init__(self): cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7) labels_hour = [integer_to_text(x, only_fem=True)[0] for x in range(1, 13)] - labels_minute_single = [ - integer_to_text(x, only_fem=True)[0] for x in range(2, 10) - ] - labels_minute_double = [ - integer_to_text(x, only_fem=True)[0] for x in range(10, 60) - ] + labels_minute_single = [integer_to_text(x, only_fem=True)[0] for x in range(2, 10)] + labels_minute_double = [integer_to_text(x, only_fem=True)[0] for x in range(10, 60)] graph_hour = pynini.union(*labels_hour) @ cardinal graph_hour |= midnight_to_hour_graph add_leading_zero_to_double_digit = pynutil.insert("0") + NEMO_DIGIT - graph_minute_single = ( - pynini.union(*labels_minute_single) - @ cardinal - @ add_leading_zero_to_double_digit - ) + graph_minute_single = pynini.union(*labels_minute_single) @ cardinal @ add_leading_zero_to_double_digit graph_minute_double = pynini.union(*labels_minute_double) @ cardinal final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') - graph_minute = pynini.union( - pynutil.insert("00"), graph_minute_single, graph_minute_double - ) + graph_minute = pynini.union(pynutil.insert("00"), graph_minute_single, graph_minute_double) final_suffix = pynutil.insert('suffix: "') + suffix_graph + pynutil.insert('"') final_suffix = delete_space + insert_space + final_suffix time_word = "דקות" - optional_delete_time = pynini.closure( - delete_space + pynutil.delete(time_word), 0, 1 - ) + optional_delete_time = pynini.closure(delete_space + pynutil.delete(time_word), 0, 1) graph_h_and_m = ( final_graph_hour + delete_space + delete_and + insert_space + pynutil.insert('minutes: "') - + pynini.union( - graph_minute_single, graph_minute_double, minute_verbose_graph - ) + + 
pynini.union(graph_minute_single, graph_minute_double, minute_verbose_graph) + pynutil.insert('"') + optional_delete_time ) @@ -175,9 +153,7 @@ def __init__(self): + delete_and + insert_space + pynutil.insert('minutes: "') - + pynini.union( - graph_minute_single, graph_minute_double, minute_verbose_graph - ) + + pynini.union(graph_minute_single, graph_minute_double, minute_verbose_graph) + pynutil.insert('"') + optional_delete_time ) @@ -210,22 +186,13 @@ def __init__(self): final_graph_midnight = ( optional_time_prefix_graph + delete_zero_or_one_space - + ( - midnight_graph - | to_midnight_verbose_graph - | graph_m_to_midnight - | graph_midnight_and_m - ) + + (midnight_graph | to_midnight_verbose_graph | graph_m_to_midnight | graph_midnight_and_m) ) final_graph = ( optional_time_prefix_graph + delete_zero_or_one_space - + ( - graph_h_and_m - | graph_special_m_to_h_suffix_time - | graph_m_to_h_suffix_time - ) + + (graph_h_and_m | graph_special_m_to_h_suffix_time | graph_m_to_h_suffix_time) + final_suffix ) final_graph |= graph_h diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py index 361b8d007..807dcf734 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py @@ -18,28 +18,17 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import \ - CardinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.date import \ - DateFst -from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import \ - DecimalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.measure import \ - MeasureFst -from 
nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import \ - OrdinalFst -from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import \ - PunctuationFst -from nemo_text_processing.inverse_text_normalization.he.taggers.time import \ - TimeFst -from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import \ - WhiteListFst -from nemo_text_processing.inverse_text_normalization.he.taggers.word import \ - WordFst -from nemo_text_processing.text_normalization.en.graph_utils import ( - delete_extra_space, delete_space, generator_main) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.he.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.he.taggers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, generator_main class ClassifyFst(GraphFst): @@ -99,21 +88,13 @@ def __init__( | pynutil.add_weight(word_graph, 100) ) - punct = ( - pynutil.insert("tokens { ") - + pynutil.add_weight(punct_graph, weight=1.1) - + pynutil.insert(" }") - ) + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + 
pynutil.insert(" }") token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) - + token - + pynini.closure(pynutil.insert(" ") + punct) + pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure( - delete_extra_space + token_plus_punct - ) + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) graph = delete_space + graph + delete_space self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py index 2273a6435..58de7668e 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py @@ -17,12 +17,9 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - GraphFst, string_map_cased) -from nemo_text_processing.inverse_text_normalization.he.utils import \ - get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import ( - convert_space, insert_space) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, string_map_cased +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import convert_space, insert_space class WhiteListFst(GraphFst): @@ -48,16 +45,11 @@ def __init__(self, input_file: str = None): raise ValueError(f"Whitelist file {input_file} not found") optional_prefix_graph = pynini.closure( - pynutil.insert('morphosyntactic_features: "') - + prefix_graph - + pynutil.insert('"') - + insert_space, + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, 0, 1, ) whitelist = string_map_cased(input_file) - graph = ( - pynutil.insert('name: "') + convert_space(whitelist) 
+ pynutil.insert('"') - ) + graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"') final_graph = optional_prefix_graph + graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py index 6c0ee4fd9..6b5394ac3 100644 --- a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py @@ -15,10 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import \ - NEMO_NOT_SPACE +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE class WordFst(GraphFst): @@ -29,9 +27,5 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = ( - pynutil.insert('name: "') - + pynini.closure(NEMO_NOT_SPACE, 1) - + pynutil.insert('"') - ) + word = pynutil.insert('name: "') + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert('"') self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/utils.py b/nemo_text_processing/inverse_text_normalization/he/utils.py index 3348c9f86..1aa996b80 100644 --- a/nemo_text_processing/inverse_text_normalization/he/utils.py +++ b/nemo_text_processing/inverse_text_normalization/he/utils.py @@ -177,8 +177,6 @@ def _less_than_100(num, only_fem=False): else: res.append(f'{tens_dict[num[0]]} {"ו"}{units_feminine_dict[num[1]]}') if not only_fem: - res.append( - f'{tens_dict[num[0]]} {"ו"}{units_masculine_dict[num[1]]}' - ) + res.append(f'{tens_dict[num[0]]} {"ו"}{units_masculine_dict[num[1]]}') return res diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py 
b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py index 11a14695b..d26e1f703 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -15,10 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - NEMO_ALPHA_HE, GraphFst) -from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, delete_space) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space class CardinalFst(GraphFst): @@ -38,10 +36,7 @@ def __init__(self): at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # Thousands separator - group_by_threes = ( - at_most_three_digits - + (pynutil.insert(",") + exactly_three_digits).closure() - ) + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() # Keep the prefix if exists and add a dash optional_prefix = pynini.closure( diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py index 9379fbf50..4a1b24599 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py @@ -15,11 +15,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_NOT_QUOTE, NEMO_SPACE, delete_space, delete_zero_or_one_space, - insert_space) + NEMO_NOT_QUOTE, + NEMO_SPACE, + delete_space, + delete_zero_or_one_space, + insert_space, +) 
class DateFst(GraphFst): @@ -98,13 +101,7 @@ def __init__(self): ) # day month and year - graph_dmy = ( - graph_dm - + delete_space - + pynutil.insert(".") - + delete_zero_or_one_space - + year - ) + graph_dmy = graph_dm + delete_space + pynutil.insert(".") + delete_zero_or_one_space + year # only month and year graph_my = ( diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py index d4d3862d2..ea69ab784 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py @@ -15,10 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - NEMO_ALPHA_HE, GraphFst) -from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space class DecimalFst(GraphFst): @@ -32,19 +30,14 @@ class DecimalFst(GraphFst): def __init__(self): super().__init__(name="decimal", kind="verbalize") - optionl_sign = pynini.closure( - pynini.cross('negative: "true"', "-") + delete_space, 0, 1 - ) + optionl_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1) # Need parser to group digits by threes exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # Thousands separator - group_by_threes = ( - at_most_three_digits - + (pynutil.insert(",") + exactly_three_digits).closure() - ) + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() integer = ( pynutil.delete("integer_part:") @@ -75,9 +68,7 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + 
pynutil.delete('"') ) - optional_quantity = pynini.closure( - pynutil.insert(" ") + quantity + delete_space, 0, 1 - ) + optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1) # Keep the prefix if exists and add a dash optional_prefix = pynini.closure( @@ -92,9 +83,7 @@ def __init__(self): 1, ) - graph = ( - optional_prefix + optional_integer + optional_fractional + optional_quantity - ) + graph = optional_prefix + optional_integer + optional_fractional + optional_quantity self.numbers = graph graph = optionl_sign + graph delete_tokens = self.delete_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py index c6bcb9ecb..a4aadd67b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -15,10 +15,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_CHAR, NEMO_NOT_QUOTE, NEMO_SIGMA, NEMO_SPACE, delete_space) + NEMO_CHAR, + NEMO_NOT_QUOTE, + NEMO_SIGMA, + NEMO_SPACE, + delete_space, +) class MeasureFst(GraphFst): @@ -96,12 +100,7 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst): numbers_units = delete_space + unit numbers_graph = (graph_cardinal | graph_decimal) + numbers_units - one_graph = ( - delete_space - + pynutil.insert("1") - + unit - + pynutil.delete('cardinal { integer: "1" }') - ) + one_graph = delete_space + pynutil.insert("1") + unit + pynutil.delete('cardinal { integer: "1" }') graph = optional_prefix + (numbers_graph | one_graph) delete_tokens = self.delete_tokens(graph) diff --git 
a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py index fb63ba0aa..a85f5b019 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py @@ -15,10 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, delete_space) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py index 9d412a1a8..3d41b783b 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py @@ -15,13 +15,15 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.inverse_text_normalization.he.utils import \ - get_abs_path +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space, delete_zero_or_one_space, - insert_space) + NEMO_DIGIT, + NEMO_NOT_QUOTE, + delete_space, + delete_zero_or_one_space, + insert_space, +) class TimeFst(GraphFst): @@ -38,44 +40,20 @@ def __init__(self): super().__init__(name="time", kind="verbalize") hour_to_noon = pynini.string_file(get_abs_path("data/time/hour_to_noon.tsv")) - 
hour_to_evening = pynini.string_file( - get_abs_path("data/time/hour_to_evening.tsv") - ) + hour_to_evening = pynini.string_file(get_abs_path("data/time/hour_to_evening.tsv")) hour_to_night = pynini.string_file(get_abs_path("data/time/hour_to_night.tsv")) day_suffixes = pynini.string_file(get_abs_path("data/time/day_suffix.tsv")) - day_suffixes = ( - insert_space - + pynutil.delete('suffix: "') - + day_suffixes - + pynutil.delete('"') - ) + day_suffixes = insert_space + pynutil.delete('suffix: "') + day_suffixes + pynutil.delete('"') noon_suffixes = pynini.string_file(get_abs_path("data/time/noon_suffix.tsv")) - noon_suffixes = ( - insert_space - + pynutil.delete('suffix: "') - + noon_suffixes - + pynutil.delete('"') - ) + noon_suffixes = insert_space + pynutil.delete('suffix: "') + noon_suffixes + pynutil.delete('"') - evening_suffixes = pynini.string_file( - get_abs_path("data/time/evening_suffix.tsv") - ) - evening_suffixes = ( - insert_space - + pynutil.delete('suffix: "') - + evening_suffixes - + pynutil.delete('"') - ) + evening_suffixes = pynini.string_file(get_abs_path("data/time/evening_suffix.tsv")) + evening_suffixes = insert_space + pynutil.delete('suffix: "') + evening_suffixes + pynutil.delete('"') night_suffixes = pynini.string_file(get_abs_path("data/time/night_suffix.tsv")) - night_suffixes = ( - insert_space - + pynutil.delete('suffix: "') - + night_suffixes - + pynutil.delete('"') - ) + night_suffixes = insert_space + pynutil.delete('suffix: "') + night_suffixes + pynutil.delete('"') hour = ( pynutil.delete("hours:") @@ -110,14 +88,7 @@ def __init__(self): [hour_to_noon, hour_to_evening, hour_to_night], [noon_suffixes, evening_suffixes, night_suffixes], ): - graph |= ( - hour @ hour_to - + delete_space - + pynutil.insert(":") - + minute - + delete_space - + suffix - ) + graph |= hour @ hour_to + delete_space + pynutil.insert(":") + minute + delete_space + suffix graph |= optional_prefix + graph delete_tokens = self.delete_tokens(graph) diff --git 
a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py index bd2a39c52..0223259db 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py @@ -12,22 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.cardinal import \ - CardinalFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.date import \ - DateFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.decimal import \ - DecimalFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.measure import \ - MeasureFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.ordinal import \ - OrdinalFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.time import \ - TimeFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.whitelist import \ - WhiteListFst +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.whitelist import WhiteListFst 
class VerbalizeFst(GraphFst): @@ -57,12 +49,6 @@ def __init__(self): whitelist_graph = WhiteListFst().fst graph = ( - time_graph - | date_graph - | measure_graph - | ordinal_graph - | decimal_graph - | cardinal_graph - | whitelist_graph + time_graph | date_graph | measure_graph | ordinal_graph | decimal_graph | cardinal_graph | whitelist_graph ) self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py index 261d11092..611181df4 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py @@ -15,14 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import \ - VerbalizeFst -from nemo_text_processing.inverse_text_normalization.he.verbalizers.word import \ - WordFst -from nemo_text_processing.text_normalization.en.graph_utils import ( - delete_extra_space, delete_space) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space class VerbalizeFinalFst(GraphFst): @@ -44,10 +40,5 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + delete_space + pynutil.delete("}") ) - graph = ( - delete_space - + pynini.closure(graph + delete_extra_space) - + graph - + delete_space - ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space self.fst = graph diff --git 
a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py index 5cb519c14..0607e0b37 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py @@ -15,10 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import ( - NEMO_ALPHA_HE, GraphFst) -from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_CHAR, NEMO_SIGMA, delete_space) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space class WhiteListFst(GraphFst): @@ -47,8 +45,6 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete('"') ) - graph = graph @ pynini.cdrewrite( - pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA - ) + graph = graph @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) final_graph = optional_prefix + graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py index 858ca8d60..49c61cf6a 100644 --- a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py @@ -15,10 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.he.graph_utils import \ - GraphFst -from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_CHAR, NEMO_SIGMA, delete_space) +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, 
delete_space class WordFst(GraphFst): @@ -30,13 +28,7 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) - char = ( - pynutil.delete("name:") - + delete_space - + pynutil.delete('"') - + chars - + pynutil.delete('"') - ) + char = pynutil.delete("name:") + delete_space + pynutil.delete('"') + chars + pynutil.delete('"') graph = char @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize()