From 2d80ba5cf8bcb087e0069a0bb5562ec636a476df Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Tue, 22 Apr 2025 22:02:28 +0530 Subject: [PATCH 01/16] Future Implementations for classes - Measure, Money, and Date (#258) * Future Implementations for classes - Measure, Money, and Date Signed-off-by: Namrata Gachchi * Resolved the conflicts with mm_yyyy and date ranges and added the previously removed failing test cases. Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed the unused empty string implementation Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fixes for the tagger files Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * reformatted decimal final graph Signed-off-by: Namrata Gachchi * incorporated the suggestion for decimal graph Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Century implementations Signed-off-by: Namrata Gachchi * Working on the yyyy format for the date class Signed-off-by: Namrata Gachchi * reverted yyyy code Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on future implementations Signed-off-by: Namrata Gachchi * working on improving the date class accuracy Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added year prefix for the date class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on the commma cases for date class Signed-off-by: Namrata Gachchi * minor fixes Signed-off-by: Namrata Gachchi * 
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * implemented mixed fractions Signed-off-by: Namrata Gachchi * rectified the test case Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on quarterly measurements Signed-off-by: Namrata Gachchi * reformatted the prefixes and suffixes for date tagger class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * replaced text tag with era tag for the date class Signed-off-by: Namrata Gachchi * Removed the text tag reference from date class verbalizer Signed-off-by: Namrata Gachchi --------- Signed-off-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 2 +- .../hi/data/date/prefixes.tsv | 3 + .../hi/data/date/suffixes.tsv | 10 +++ .../hi/data/date/year_suffix.tsv | 2 + .../hi/data/measure/quarterly_units.tsv | 12 +++ .../hi/data/measure/unit.tsv | 4 +- .../hi/data/money/currency.tsv | 3 +- .../hi/data/money/major_minor_currencies.tsv | 9 +++ .../hi/data/numbers/teens_and_ties.tsv | 16 ++-- .../text_normalization/hi/data/time/hours.tsv | 1 + .../text_normalization/hi/taggers/cardinal.py | 3 + .../text_normalization/hi/taggers/date.py | 57 ++++++++++++- .../text_normalization/hi/taggers/measure.py | 75 ++++++++++++++++-- .../text_normalization/hi/taggers/money.py | 36 +++++---- .../hi/taggers/tokenize_and_classify.py | 8 +- .../text_normalization/hi/verbalizers/date.py | 4 +- .../hi/verbalizers/fraction.py | 7 +- .../hi/verbalizers/money.py | 79 ++++++++++++++----- .../hi/verbalizers/verbalize.py | 20 +++-- .../hi/verbalizers/whitelist.py | 2 + .../test_cases_date.txt | 15 ++++ .../test_cases_fraction.txt | 4 +- .../test_cases_measure.txt | 4 + .../test_cases_money.txt | 20 ++++- 24 files changed, 324 insertions(+), 72 
deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv diff --git a/Jenkinsfile b/Jenkinsfile index c94c107c6..53c784920 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-12-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv new file mode 100644 index 000000000..d4c1ca0b1 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv @@ -0,0 +1,3 @@ +सन् +सन +साल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv new file mode 100644 index 000000000..6806d3f12 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv @@ -0,0 +1,10 @@ + में + का + की + के + से + तक + ईस्वी + शताब्दी + दशक + सदी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv new file mode 100644 index 000000000..acb37d534 --- /dev/null 
+++ b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv @@ -0,0 +1,2 @@ +ई. पू. ईसा पूर्व +ई. ईसवी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv new file mode 100644 index 000000000..eaddf930a --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv @@ -0,0 +1,12 @@ +s सेकंड +hr घंटा +h घंटे +min मिनट +doz दर्जन +yr साल +yr वर्ष +hp हॉर्सपॉवर +d दिन +month महीना +months महीने +हफ़्ते हफ़्ते \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 0bf561379..189512687 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -141,14 +141,16 @@ month महीना months महीने ct कैरेट pH पीएच +km/h किलोमीटर प्रति घंटा km/hr किलोमीटर प्रति घंटा km/min किलोमीटर प्रति मिनट +m/h मीटर प्रति घंटा m/hr मीटर प्रति घंटा mi/s मील प्रति सेकंड +mi/h मील प्रति घंटा mi/hr मील प्रति घंटा mi/min मील प्रति मिनट ₹/ac रुपए प्रति एकड़ x बाई X बाई * बाई -- से diff --git a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv index 88633ec7c..8f4a955cc 100644 --- a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv +++ b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv @@ -1,5 +1,4 @@ ₹ रुपए -P पैसे £ पाउंड ₩ वॉन $ डॉलर @@ -7,4 +6,4 @@ $ डॉलर ৳ टका ¥ येन ₦ नाइरा -€ यूरो +€ यूरो \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv new file mode 100644 index 000000000..cf62891d1 --- /dev/null +++ 
b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv @@ -0,0 +1,9 @@ +रुपए पैसे +पाउंड पेंस +वॉन जिओन +डॉलर सेंट +लीरा कुरस +टका पैसे +येन सेन +नाइरा कोबो +यूरो सेंट diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv index 1d61c77b7..fbf248266 100644 --- a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv +++ b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv @@ -79,12 +79,12 @@ ८८ अट्ठासी ८९ नवासी ९० नब्बे -९१ इक्यानबे -९२ बानबे -९३ तिरानबे -९४ चौरानबे -९५ पंचानबे -९६ छियानबे -९७ सत्तानबे -९८ अट्ठानबे +९१ इक्यानबे +९२ बानबे +९३ तिरानबे +९४ चौरानबे +९५ पंचानबे +९६ छियानबे +९७ सत्तानबे +९८ अट्ठानबे ९९ निन्यानबे diff --git a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv index d5e85a784..dd8623284 100644 --- a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv +++ b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv @@ -1,3 +1,4 @@ +० शून्य १ एक २ दो ३ तीन diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index fe3ad9a1d..05d7a4ee4 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -80,6 +80,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 1, teens_ties) graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 0, graph_hundreds) graph_ten_thousands.optimize() + self.graph_ten_thousands = graph_ten_thousands # Lakhs graph and ten lakhs graph suffix_lakhs = pynutil.insert(" लाख") @@ -90,6 +91,7 @@ def create_larger_number_graph(digit_graph, suffix, 
zeros_counts, sub_graph): graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 1, graph_thousands) graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 0, graph_ten_thousands) graph_lakhs.optimize() + self.graph_lakhs = graph_lakhs graph_ten_lakhs = create_graph_suffix(teens_and_ties, suffix_lakhs, 5) graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 4, digit) @@ -98,6 +100,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 1, graph_thousands) graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 0, graph_ten_thousands) graph_ten_lakhs.optimize() + self.graph_ten_lakhs = graph_ten_lakhs # Crores graph ten crores graph suffix_crores = pynutil.insert(" करोड़") diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 19aaf3139..468753e23 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -26,6 +26,20 @@ days = pynini.string_file(get_abs_path("data/date/days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) +year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) +digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) +teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + +# Read suffixes from file into a list +with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: + suffixes_list = f.read().splitlines() +with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: + prefixes_list = f.read().splitlines() + +# Create union of suffixes and prefixes +suffix_union = pynini.union(*suffixes_list) +prefix_union = pynini.union(*prefixes_list) class 
DateFst(GraphFst): @@ -51,10 +65,15 @@ def __init__(self, cardinal: GraphFst): (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand ) + cardinal_graph = ( + digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands + ) + graph_year = graph_year_thousands | graph_year_hundreds_as_thousands delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space @@ -68,6 +87,22 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd += pynutil.insert(" preserve_order: true ") + # Graph for era + era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + + range_graph = pynini.cross("-", "से") + + # Graph for year + century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") + century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space + + # Updated logic to use suffix_union + year_number = graph_year + suffix_union + year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space + + # Updated logic to use prefix_union + year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") + graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph ) @@ -78,7 +113,20 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = months_graph + delete_dash + years_graph + graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph + + graph_year_suffix = era_graph + + graph_range = ( + pynutil.insert("era: \"") + + cardinal_graph + + insert_space + + range_graph + + insert_space + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" 
preserve_order: true ") + ) # default assume dd_mm_yyyy @@ -87,7 +135,12 @@ def __init__(self, cardinal: GraphFst): | graph_mm_dd | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy - | graph_mm_yyyy + | pynutil.add_weight(graph_mm_yyyy, -0.2) + | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_range, -0.005) + | pynutil.add_weight(century_text, -0.001) + | pynutil.add_weight(year_text, -0.001) + | pynutil.add_weight(year_prefix, -0.009) ) self.final_graph = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 7434fd70f..954215771 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -19,6 +19,11 @@ from nemo_text_processing.text_normalization.hi.utils import get_abs_path +digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) +teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + + class MeasureFst(GraphFst): """ Finite state transducer for classifying measure, suppletive aware, e.g. 
@@ -35,26 +40,55 @@ class MeasureFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") - cardinal_graph = cardinal.final_graph - decimal_graph = decimal.final_graph_wo_negative + cardinal_graph = ( + digit + | teens_and_ties + | cardinal.graph_hundreds + | cardinal.graph_thousands + | cardinal.graph_ten_thousands + | cardinal.graph_lakhs + | cardinal.graph_ten_lakhs + ) + point = pynutil.delete(".") + decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) + quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, ) + # Define the quarterly measurements + quarter = pynini.string_map([(".५", "साढ़े"), ("१.५", "डेढ़"), ("२.५", "ढाई"),]) + quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") + # Define the unit handling - self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ") + units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") - graph_measurements = ( + # Handling symbols like x, X, * + symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),]) + + graph_decimal = ( pynutil.insert("decimal { ") + optional_graph_negative + decimal_graph + pynutil.insert(" }") + delete_space - + self.unit + + unit ) - graph_measurements |= ( + + graph_quarter = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + quarter_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_cardinal = ( pynutil.insert("cardinal { ") + optional_graph_negative + 
pynutil.insert("integer: \"") @@ -62,10 +96,35 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("\"") + pynutil.insert(" }") + delete_space - + self.unit + + unit ) - graph = graph_measurements + # Handling cardinal clubbed with symbol as single token + graph_exceptions = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynutil.insert(" units: \"") + + symbol_graph + + pynutil.insert("\" ") + + pynutil.insert("} }") + + insert_space + + pynutil.insert("tokens { cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + ) + + graph = ( + pynutil.add_weight(graph_decimal, 0.01) + | pynutil.add_weight(graph_quarter, 0.005) + | pynutil.add_weight(graph_cardinal, 0.01) + | pynutil.add_weight(graph_exceptions, 0.01) + ) self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index c44d6d346..6d9ac6dcc 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -24,9 +24,11 @@ class MoneyFst(GraphFst): """ Finite state transducer for classifying money, suppletive aware, e.g. 
- ₹1 -> money { currency: "रुपए" integer_part: "एक" } - ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" } - + ₹५० -> money { money { currency_maj: "रुपए" integer_part: "पचास" } + ₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" } + ₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } + Note that the 'centiles' string is a placeholder to handle by the verbalizer by applying the corresponding minor currency denomination + Args: cardinal: CardinalFst decimal: DecimalFst @@ -34,7 +36,7 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self, cardinal: GraphFst): super().__init__(name="money", kind="classify") cardinal_graph = cardinal.final_graph @@ -42,21 +44,25 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, ) - self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") - self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ") - self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ") + currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"') + integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"') + currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"') - graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger - graph_currencies |= ( + graph_major_only = optional_graph_negative + currency_major + insert_space + integer + 
graph_major_and_minor = ( optional_graph_negative - + self.currency + + currency_major + insert_space - + self.interger - + pynutil.delete(".") + + integer + + pynini.cross(".", " ") + + fraction + insert_space - + self.fraction + + currency_minor ) - graph = graph_currencies - self.graph = graph.optimize() + + graph_currencies = graph_major_only | graph_major_and_minor + + graph = graph_currencies.optimize() final_graph = self.add_tokens(graph) self.fst = final_graph diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 48ee97ef3..bdec90c06 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -43,7 +43,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. 
deterministic: if True will provide a single transduction option, @@ -68,11 +68,11 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far" + cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - logging.info(f'ClassifyFst.fst was restored from {far_file}.') + logging.info(f"ClassifyFst.fst was restored from {far_file}.") else: logging.info(f"Creating ClassifyFst grammars.") @@ -107,7 +107,7 @@ def __init__( logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") start_time = time.time() - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst(cardinal=cardinal) money_graph = money.fst logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index 1265fcec6..187acf7d6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -39,6 +39,8 @@ def __init__(self): year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_dd_mm = day + NEMO_SPACE + month graph_mm_dd = month + NEMO_SPACE + day @@ -60,7 +62,7 @@ def __init__(self): ) self.graph = ( - (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy) + (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era) + delete_space + 
optional_preserve_order ) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index e4cfae302..cba534e61 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -39,10 +39,15 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): numerator = pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ") denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") insert_bata = pynutil.insert(" बटा ") + insert_aur = pynutil.insert(" और ") fraction_default = numerator + insert_bata + denominator - self.graph = optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space) + fraction_default + self.graph = ( + optional_sign + + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + + fraction_default + ) graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/money.py b/nemo_text_processing/text_normalization/hi/verbalizers/money.py index d5cab33d8..048140295 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/money.py @@ -15,14 +15,26 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +major_minor_currencies = { + "रुपए": "पैसे", + "पाउंड": "पेंस", + "वॉन": "जिओन", + "डॉलर": "सेंट", + "लीरा": "कुरस", + "टका": "पैसे", + "येन": "सेन", + "नाइरा": "कोबो", + "यूरो": "सेंट", +} +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst class MoneyFst(GraphFst): """ Finite state transducer for verbalizing money, e.g. 
- money { integer_part: "बारह" currency: "रुपए" } -> बारह रुपए - money { integer_part: "बारह" currency: "रुपए" fractional_part: "पचास" currency: "पैसे" } -> बारह रुपए पचास पैसे + money { integer_part: "बारह" currency_maj: "रुपए" } -> बारह रुपए + money { integer_part: "बारह" currency_maj: "रुपए" fractional_part: "पचास" currency_min: "centiles" } -> बारह रुपए पचास पैसे + money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } -> पचास पैसे Args: cardinal: CardinalFst @@ -31,33 +43,58 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self): super().__init__(name="money", kind="verbalize") - insert_paise = pynutil.insert("पैसे") + currency_major = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') - currency = ( - pynutil.delete('currency: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) - - integer_part = ( - pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) + integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') fractional_part = ( - pynutil.delete('fractional_part: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('" ') - + insert_space + pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - graph_integer = integer_part + delete_space + currency + # Handles major denominations only + graph_major_only = integer_part + pynini.accep(NEMO_SPACE) + currency_major - graph_interger_fraction = ( - integer_part + delete_space + currency + delete_space + fractional_part + delete_space + insert_paise - ) + # Handles both major and minor denominations + major_minor_graphs = [] + + # Handles minor denominations only + minor_graphs = [] + + # Logic for 
handling minor denominations + for major, minor in major_minor_currencies.items(): + graph_major = pynutil.delete('currency_maj: "') + pynini.accep(major) + pynutil.delete('"') + graph_minor = pynutil.delete('currency_min: "') + pynini.cross("centiles", minor) + pynutil.delete('"') + graph_major_minor_partial = ( + integer_part + + pynini.accep(NEMO_SPACE) + + graph_major + + pynini.accep(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + major_minor_graphs.append(graph_major_minor_partial) + + graph_minor_partial = ( + pynutil.delete('integer_part: "शून्य"') + + pynutil.delete(NEMO_SPACE) + + pynutil.delete('currency_maj: "') + + pynutil.delete(major) + + pynutil.delete('"') + + pynutil.delete(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + minor_graphs.append(graph_minor_partial) + + graph_major_minor = pynini.union(*major_minor_graphs) + graph_minor_only = pynini.union(*minor_graphs) - graph = graph_integer | graph_interger_fraction + graph = graph_major_only | graph_major_minor | pynutil.add_weight(graph_minor_only, -0.1) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index ca06fc9c3..e91f0d9f6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -20,8 +20,7 @@ from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst - -# from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst class 
VerbalizeFst(GraphFst): @@ -56,11 +55,20 @@ def __init__(self, deterministic: bool = True): measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst() money_graph = money.fst - # whitelist_graph = WhiteListFst(deterministic=deterministic).fst - - graph = cardinal_graph | decimal_graph | fraction_graph | date_graph | time_graph | measure_graph | money_graph + whitelist_graph = WhiteListFst(deterministic=deterministic).fst + + graph = ( + cardinal_graph + | decimal_graph + | fraction_graph + | date_graph + | time_graph + | measure_graph + | money_graph + | whitelist_graph + ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py index 3f478a2d2..ed419f2f7 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index d92a53852..a4b3caf07 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -17,3 +17,18 @@ ११-२०२४~नवंबर दो हज़ार चौबीस २०७०~दो हज़ार सत्तर २०२४~दो हज़ार चौबीस +१२० ई. पू.~एक सौ बीस ईसा पूर्व +२९७-२७२ ई. 
पू.~दो सौ सत्तानबे से दो सौ बहत्तर ईसा पूर्व +३२७वीं सदी~तीन सौ सत्ताईसवीं सदी +१८वीं शताब्दी~अठारहवीं शताब्दी +१९वीं दशक~उन्नीसवीं दशक +१९९९ में~उन्नीस सौ निन्यानबे में +१९९० का~उन्नीस सौ नब्बे का +१९९२ की~उन्नीस सौ बानबे की +१९६० के अभिनेता है~उन्नीस सौ साठ के अभिनेता है +१७८८ से~सत्रह सौ अट्ठासी से +१९५४ तक~उन्नीस सौ चौवन तक +सन १९९९~सन उन्नीस सौ निन्यानबे +सन् १९२०~सन् उन्नीस सौ बीस +साल १९७१~साल उन्नीस सौ इकहत्तर +१९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt index 25c18b777..d1473412e 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -1,5 +1,5 @@ ९९/९९~निन्यानबे बटा निन्यानबे -२२ ३१/१७~बाईस इकतीस बटा सत्रह +२२ ३१/१७~बाईस और इकतीस बटा सत्रह ९७/०~सत्तानबे बटा शून्य २५६३/४१२~दो हज़ार पाँच सौ तिरेसठ बटा चार सौ बारह ७२८६०/७०~बहत्तर हज़ार आठ सौ साठ बटा सत्तर @@ -19,3 +19,5 @@ १०००००००००००००/३~एक नील बटा तीन १०००००००००००००००/८~एक पद्म बटा आठ १०००००००००००००००००/४१२~एक शंख बटा चार सौ बारह +२ २/७~दो और दो बटा सात +१२० ७५/९०~एक सौ बीस और पचहत्तर बटा नब्बे \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 453369f82..86a824f72 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -60,3 +60,7 @@ ९९.५ oz~निन्यानबे दशमलव पाँच आउन्स ८५ q~पचासी क्विंटल ८५.९९ q~पचासी दशमलव नौ नौ क्विंटल +२००x१० के गद्दे~दो सौ बाई दस के गद्दे +५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा +२x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब +१३x१३ का घर~तेरह बाई तेरह का घर diff --git 
a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt index c7b32628b..b576dac38 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt @@ -97,4 +97,22 @@ $२८२१~दो हज़ार आठ सौ इक्कीस डॉल ₹५४५~पाँच सौ पैंतालीस रुपए ₹१८४५~एक हज़ार आठ सौ पैंतालीस रुपए ₹३७२~तीन सौ बहत्तर रुपए -$९८~अट्ठानबे डॉलर \ No newline at end of file +$९८~अट्ठानबे डॉलर +₹१२३.५७~एक सौ तेईस रुपए सत्तावन पैसे +₹९९९.५०~नौ सौ निन्यानबे रुपए पचास पैसे +£१५०.२९~एक सौ पचास पाउंड उनतीस पेंस +£८०.३१~अस्सी पाउंड इकतीस पेंस +₩२३४५.१०~दो हज़ार तीन सौ पैंतालीस वॉन दस जिओन +₩१००.२५~एक सौ वॉन पच्चीस जिओन +$१२५.७०~एक सौ पच्चीस डॉलर सत्तर सेंट +$९.९९~नौ डॉलर निन्यानबे सेंट +₺८०.३६~अस्सी लीरा छत्तीस कुरस +₺१२३४.७८~एक हज़ार दो सौ चौंतीस लीरा अठहत्तर कुरस +৳१००.४२~एक सौ टका बयालीस पैसे +৳३०२५.८७~तीन हज़ार पच्चीस टका सत्तासी पैसे +¥१००.४८~एक सौ येन अड़तालीस सेन +¥७७७.२३~सात सौ सतहत्तर येन तेईस सेन +₦८७६.५३~आठ सौ छिहत्तर नाइरा तिरेपन कोबो +₦१०.२७~दस नाइरा सत्ताईस कोबो +€२००.९०~दो सौ यूरो नब्बे सेंट +€१२३४.७५~एक हज़ार दो सौ चौंतीस यूरो पचहत्तर सेंट From c4987310baf6e0568c1dd2c45e87ef55e6fc0226 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Tue, 22 Apr 2025 09:36:22 -0700 Subject: [PATCH 02/16] update jenkins cache Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 53c784920..51ce37a10 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-12-25-0' + 
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From 2e6d4e89b0483e8d321b61333f18b42dd70984cd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Apr 2025 16:39:03 +0000 Subject: [PATCH 03/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/measure.py | 16 ++++++++++++++-- .../hi/taggers/tokenize_and_classify.py | 3 ++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index ea6430365..9f1ffbd39 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -62,7 +62,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) # Define the quarterly measurements - quarter = pynini.string_map([(".५", "साढ़े"), ("१.५", "डेढ़"), ("२.५", "ढाई"),]) + quarter = pynini.string_map( + [ + (".५", "साढ़े"), + ("१.५", "डेढ़"), + ("२.५", "ढाई"), + ] + ) quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") # Define the unit handling @@ -70,7 +76,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") # Handling symbols like x, X, * - symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),]) + symbol_graph = pynini.string_map( + [ + ("x", "बाई"), + ("X", "बाई"), + ("*", "बाई"), + ] + ) graph_decimal = ( pynutil.insert("decimal { ") diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index bdec90c06..b1bbd2a10 100644 --- 
a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -68,7 +68,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", + cache_dir, + f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] From 95883964b892b5d7f5be907c66620514fcc06c31 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Wed, 23 Apr 2025 13:42:28 -0400 Subject: [PATCH 04/16] Potential fix for code scanning alert no. 821: Unused local variable Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> --- nemo_text_processing/text_normalization/hi/taggers/date.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index b8b652128..37b192165 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -73,7 +73,6 @@ def __init__(self, cardinal: GraphFst): delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") - delete_comma = pynutil.delete(",") days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space From 714c1ccc118494311f8e0a9f8ce46261f7d4a5e8 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Thu, 28 Aug 2025 00:18:01 +0530 Subject: [PATCH 05/16] Hindi TN Future Implementations 2.0. 
- Fraction, Measure and Time (#310) * Staging hi tn (#271) * Future Implementations for classes - Measure, Money, and Date (#258) * Future Implementations for classes - Measure, Money, and Date Signed-off-by: Namrata Gachchi * Resolved the conflicts with mm_yyyy and date ranges and added the previously removed failing test cases. Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed the unused empty string implementation Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fixes for the tagger files Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * reformatted decimal final graph Signed-off-by: Namrata Gachchi * incorporated the suggestion for decimal graph Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Century implementations Signed-off-by: Namrata Gachchi * Working on the yyyy format for the date class Signed-off-by: Namrata Gachchi * reverted yyyy code Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on future implementations Signed-off-by: Namrata Gachchi * working on improving the date class accuracy Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added year prefix for the date class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on the comma cases for date class Signed-off-by: Namrata Gachchi * minor fixes Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see
https://pre-commit.ci * implemented mixed fractions Signed-off-by: Namrata Gachchi * rectified the test case Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on quarterly measurements Signed-off-by: Namrata Gachchi * reformatted the prefixes and suffixes for date tagger class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * replaced text tag with era tag for the date class Signed-off-by: Namrata Gachchi * Removed the text tag reference from date class verbalizer Signed-off-by: Namrata Gachchi --------- Signed-off-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update jenkins cache Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Potential fix for code scanning alert no. 
821: Unused local variable Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> --------- Signed-off-by: Namrata Gachchi Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> * Future Implementations Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Namrata Gachchi Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- Jenkinsfile | 2 +- .../hi/data/measure/quarterly_units.tsv | 6 +- .../hi/data/measure/unit.tsv | 1 - .../hi/data/whitelist/paune_mappings.tsv | 100 ++++++++++++++++++ .../text_normalization/hi/taggers/cardinal.py | 16 +-- .../text_normalization/hi/taggers/decimal.py | 4 +- .../text_normalization/hi/taggers/fraction.py | 37 ++++++- .../text_normalization/hi/taggers/measure.py | 55 +++++++++- .../text_normalization/hi/taggers/time.py | 33 +++++- .../hi/taggers/tokenize_and_classify.py | 2 +- .../text_normalization/hi/taggers/word.py | 1 - .../hi/verbalizers/fraction.py | 5 +- .../text_normalization/hi/verbalizers/time.py | 8 +- .../hi/verbalizers/verbalize.py | 2 +- 14 files changed, 243 insertions(+), 29 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv diff 
--git a/Jenkinsfile b/Jenkinsfile index 51ce37a10..bdddfaf4b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-06-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv index eaddf930a..6bdfb34f8 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv @@ -4,9 +4,11 @@ h घंटे min मिनट doz दर्जन yr साल -yr वर्ष hp हॉर्सपॉवर d दिन month महीना months महीने -हफ़्ते हफ़्ते \ No newline at end of file +हफ़्ते +सप्ताह +सदियां +सदियों \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 189512687..4065bc86b 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -134,7 +134,6 @@ KHz किलोहर्ट्ज़ N न्यूटन dB डेसीबल yr साल -yr वर्ष hp हॉर्सपॉवर d दिन month महीना diff --git a/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv b/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv new file mode 100644 index 000000000..3477871e4 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv @@ -0,0 +1,100 @@ +० एक +१ दो +२ तीन +३ चार +४ पाँच +५ छह +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह +१२ 
तेरह +१३ चौदह +१४ पंद्रह +१५ सोलह +१६ सत्रह +१७ अठारह +१८ उन्नीस +१९ बीस +२० इक्कीस +२१ बाईस +२२ तेईस +२३ चौबीस +२४ पच्चीस +२५ छब्बीस +२६ सत्ताईस +२७ अट्ठाईस +२८ उनतीस +२९ तीस +३० इकतीस +३१ बत्तीस +३२ तैंतीस +३३ चौंतीस +३४ पैंतीस +३५ छत्तीस +३६ सैंतीस +३७ अड़तीस +३८ उनतालीस +३९ चालीस +४० इकतालीस +४१ बयालीस +४२ तैंतालीस +४३ चौवालीस +४४ पैंतालीस +४५ छियालीस +४६ सैंतालीस +४७ अड़तालीस +४८ उनचास +४९ पचास +५० इक्यावन +५१ बावन +५२ तिरेपन +५३ चौवन +५४ पचपन +५५ छप्पन +५६ सत्तावन +५७ अट्ठावन +५८ उनसठ +५९ साठ +६० इकसठ +६१ बासठ +६२ तिरेसठ +६३ चौंसठ +६४ पैंसठ +६५ छियासठ +६६ सड़सठ +६७ अड़सठ +६८ उनहत्तर +६९ सत्तर +७० इकहत्तर +७१ बहत्तर +७२ तिहत्तर +७३ चौहत्तर +७४ पचहत्तर +७५ छिहत्तर +७६ सतहत्तर +७७ अठहत्तर +७८ उनासी +७९ अस्सी +८० इक्यासी +८१ बयासी +८२ तिरासी +८३ चौरासी +८४ पचासी +८५ छियासी +८६ सत्तासी +८७ अट्ठासी +८८ नवासी +८९ नब्बे +९० इक्यानबे +९१ बानबे +९२ तिरानबे +९३ चौरानबे +९४ पंचानबे +९५ छियानबे +९६ सत्तानबे +९७ अट्ठानबे +९८ निन्यानबे +९९ एक सौ diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index c50384acf..bc7594ad9 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -21,12 +21,12 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. - -२३ -> cardinal { negative: "true" integer: "तेइस" } } - s - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + Finite state transducer for classifying cardinals, e.g. 
+ -२३ -> cardinal { negative: "true" integer: "तेइस" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True, lm: bool = False): @@ -37,6 +37,10 @@ def __init__(self, deterministic: bool = True, lm: bool = False): teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + self.digit = digit + self.zero = zero + self.teens_and_ties = teens_and_ties + def create_graph_suffix(digit_graph, suffix, zeros_counts): zero = pynutil.add_weight(pynutil.delete("०"), -0.1) if zeros_counts == 0: diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py b/nemo_text_processing/text_normalization/hi/taggers/decimal.py index 955e8c0d3..cb21d85b1 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -58,9 +58,7 @@ class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) - graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - + graph_digit = cardinal.digit | cardinal.zero cardinal_graph = cardinal.final_graph self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index 8971cd3dd..d995608da 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -16,6 +16,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from 
nemo_text_processing.text_normalization.hi.utils import get_abs_path class FractionFst(GraphFst): @@ -47,13 +48,43 @@ def __init__(self, cardinal, deterministic: bool = True): ) self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") - self.graph = ( + dedh_dhai_graph = pynini.string_map([("१ १/२", "डेढ़"), ("२ १/२", "ढाई")]) + + savva_numbers = cardinal_graph + pynini.cross(" १/४", "") + savva_graph = pynutil.insert("सवा ") + savva_numbers + + sadhe_numbers = cardinal_graph + pynini.cross(" १/२", "") + sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(" ३/४", "") + paune_graph = pynutil.insert("पौने ") + paune_numbers + + graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ") + + graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ") + + graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ") + + graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ") + + final_graph = ( self.optional_graph_negative + pynini.closure(self.integer + pynini.accep(" "), 0, 1) + self.numerator + self.denominator ) + weighted_graph = ( + final_graph + | pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_savva, -0.1) + | pynutil.add_weight(graph_sadhe, -0.1) + | pynutil.add_weight(graph_paune, -0.2) + ) + + self.graph = weighted_graph + graph = self.graph - final_graph = self.add_tokens(graph) - self.fst = final_graph.optimize() + graph = self.add_tokens(graph) + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 9f1ffbd39..919a69929 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py 
+++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -41,8 +41,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") cardinal_graph = ( - digit - | teens_and_ties + cardinal.zero + | cardinal.digit + | cardinal.teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands | cardinal.graph_ten_thousands @@ -52,6 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): point = pynutil.delete(".") decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional + unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) @@ -93,10 +95,50 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + unit ) - graph_quarter = ( + dedh_dhai = pynini.string_map([("१.५", "डेढ़"), ("२.५", "ढाई")]) + dedh_dhai_graph = pynutil.insert("integer: \"") + dedh_dhai + pynutil.insert("\"") + + savva_numbers = cardinal_graph + pynini.cross(".२५", "") + savva_graph = pynutil.insert("integer: \"सवा ") + savva_numbers + pynutil.insert("\"") + + sadhe_numbers = cardinal_graph + pynini.cross(".५", "") + sadhe_graph = pynutil.insert("integer: \"साढ़े ") + sadhe_numbers + pynutil.insert("\"") + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(".७५", "") + paune_graph = pynutil.insert("integer: \"पौने ") + paune_numbers + pynutil.insert("\"") + + graph_dedh_dhai = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + dedh_dhai_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_savva = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + savva_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_sadhe = ( + pynutil.insert("cardinal { ") + + 
optional_graph_negative + + sadhe_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_paune = ( pynutil.insert("cardinal { ") + optional_graph_negative - + quarter_graph + + paune_graph + pynutil.insert(" }") + delete_space + units @@ -135,9 +177,12 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph = ( pynutil.add_weight(graph_decimal, 0.01) - | pynutil.add_weight(graph_quarter, 0.005) | pynutil.add_weight(graph_cardinal, 0.01) | pynutil.add_weight(graph_exceptions, 0.01) + | pynutil.add_weight(graph_dedh_dhai, 0.001) + | pynutil.add_weight(graph_savva, 0.005) + | pynutil.add_weight(graph_sadhe, 0.005) + | pynutil.add_weight(graph_paune, -0.2) ) self.graph = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/time.py b/nemo_text_processing/text_normalization/hi/taggers/time.py index 6c87c9aad..e78b31380 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/text_normalization/hi/taggers/time.py @@ -36,10 +36,11 @@ class TimeFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="classify") delete_colon = pynutil.delete(":") + cardinal_graph = cardinal.digit | cardinal.teens_and_ties self.hours = pynutil.insert("hours: \"") + hours_graph + pynutil.insert("\" ") self.minutes = pynutil.insert("minutes: \"") + minutes_graph + pynutil.insert("\" ") @@ -56,7 +57,35 @@ def __init__(self): # hour graph_h = self.hours + delete_colon + pynutil.delete("००") - final_graph = graph_hms | graph_hm | graph_h + dedh_dhai_graph = pynini.string_map([("१:३०", "डेढ़"), ("२:३०", "ढाई")]) + + savva_numbers = cardinal_graph + pynini.cross(":१५", "") + savva_graph = pynutil.insert("सवा ") + savva_numbers + + sadhe_numbers = cardinal_graph + pynini.cross(":३०", "") + sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers + + paune = 
pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(":४५", "") + paune_graph = pynutil.insert("पौने ") + paune_numbers + + graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ") + + graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ") + + graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ") + + graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ") + + final_graph = ( + graph_hms + | pynutil.add_weight(graph_hm, 0.01) + | pynutil.add_weight(graph_h, 0.01) + | pynutil.add_weight(graph_dedh_dhai, 0.001) + | pynutil.add_weight(graph_savva, 0.005) + | pynutil.add_weight(graph_sadhe, 0.005) + | pynutil.add_weight(graph_paune, 0.001) + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index b1bbd2a10..0ce75822e 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -98,7 +98,7 @@ def __init__( logging.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes") start_time = time.time() - timefst = TimeFst() + timefst = TimeFst(cardinal=cardinal) time_graph = timefst.fst logging.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/hi/taggers/word.py b/nemo_text_processing/text_normalization/hi/taggers/word.py index bc354232b..151a72e99 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/word.py +++ b/nemo_text_processing/text_normalization/hi/taggers/word.py @@ -43,7 +43,6 @@ def __init__(self, 
punctuation: PunctuationFst, deterministic: bool = True): *[chr(i) for i in range(ord("ऀ"), ord("ः") + 1)], # Hindi vowels and consonants *[chr(i) for i in range(ord("अ"), ord("ह") + 1)], # More Hindi characters *[chr(i) for i in range(ord("ा"), ord("्") + 1)], # Hindi diacritics - *[chr(i) for i in range(ord("०"), ord("९") + 1)], # Hindi digits ).optimize() # Include punctuation in the graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index 7e3b33b7c..a07c41eae 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -40,6 +40,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") insert_bata = pynutil.insert(" बटा ") insert_aur = pynutil.insert(" और ") + graph_quarter = ( + pynutil.delete("morphosyntactic_features: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) fraction_default = numerator + insert_bata + denominator @@ -47,7 +50,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + fraction_default - ) + ) | graph_quarter graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/time.py b/nemo_text_processing/text_normalization/hi/verbalizers/time.py index da10df4a0..df232e3cd 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/time.py @@ -30,7 +30,7 @@ class TimeFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="verbalize") hour = 
pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + insert_space @@ -63,13 +63,17 @@ def __init__(self): + insert_second ) + graph_quarter = ( + pynutil.delete("morphosyntactic_features: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + # hour minute graph_hm = hour + delete_space + insert_bajkar + insert_space + minute + delete_space + insert_minute # hour graph_h = hour + delete_space + insert_baje - self.graph = graph_hms | graph_hm | graph_h + self.graph = graph_hms | graph_hm | graph_h | graph_quarter final_graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index e91f0d9f6..60ba05810 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -49,7 +49,7 @@ def __init__(self, deterministic: bool = True): date = DateFst() date_graph = date.fst - time = TimeFst() + time = TimeFst(cardinal=cardinal) time_graph = time.fst measure = MeasureFst(cardinal=cardinal, decimal=decimal) From 68529fd54ed7b82d1b73481ed2d92edf4137bd9d Mon Sep 17 00:00:00 2001 From: shreeshd-tn Date: Tue, 9 Sep 2025 20:57:47 +0530 Subject: [PATCH 06/16] Hindi TN 2.0 - Telephone class integration from staging branch (#320) * telephone class integration (cherry picked from commit a7c9adf48038c95336419eb02543d53a50538e03) Signed-off-by: shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: shreeshd-tn * Updated date in Jenkins file to the PR creation date Signed-off-by: shreeshd-tn * Jenkins file date change Signed-off-by: shreeshd-tn * Trying today's date Signed-off-by: shreeshd-tn * improved country code coverage + some test cases Signed-off-by: shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * Ignore test generated files Signed-off-by: shreeshd-tn * Improved landline detection and added edge test cases for proper coverage Signed-off-by: shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Deleted gitignore file Signed-off-by: shreeshd-tn --------- Signed-off-by: shreeshd-tn Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 2 +- .../hi/data/telephone/__init__.py | 13 + .../hi/data/telephone/credit_context.tsv | 3 + .../hi/data/telephone/landline_context.tsv | 5 + .../hi/data/telephone/mobile_context.tsv | 4 + .../hi/data/telephone/number.tsv | 10 + .../hi/data/telephone/pincode_context.tsv | 4 + .../hi/taggers/telephone.py | 227 ++++++++++++++++++ .../hi/taggers/tokenize_and_classify.py | 16 +- .../hi/verbalizers/telephone.py | 72 ++++++ .../hi/verbalizers/verbalize.py | 5 + .../test_cases_telephone.txt | 25 ++ .../hi/test_sparrowhawk_normalization.sh | 8 +- .../nemo_text_processing/hi/test_telephone.py | 11 + 14 files changed, 397 insertions(+), 8 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/telephone/__init__.py create mode 100644 nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/telephone/number.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv create mode 100644 nemo_text_processing/text_normalization/hi/taggers/telephone.py create mode 100644 nemo_text_processing/text_normalization/hi/verbalizers/telephone.py create mode 100644 tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt diff --git a/Jenkinsfile 
b/Jenkinsfile index bdddfaf4b..86019d132 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-06-25-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-28-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py b/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv new file mode 100644 index 000000000..46b485af6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv @@ -0,0 +1,3 @@ +नंबर +कार्ड +क्रेडिट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv new file mode 100644 index 000000000..17a123bee --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv @@ -0,0 +1,5 @@ +नंबर +मोबाइल +फोन +लैंडलाइन +कॉल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv new file mode 100644 index 000000000..f2fa6e52f --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv @@ -0,0 +1,4 @@ +नंबर +मोबाइल +फोन +कॉल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv new file mode 100644 index 000000000..e8c04b723 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv @@ -0,0 +1,10 @@ +0 शून्य +1 एक +2 दो +3 तीन +4 चार +5 पाँच +6 छह +7 सात +8 आठ +9 नौ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv new file mode 100644 index 000000000..322c7248e --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv @@ -0,0 +1,4 @@ +नंबर +पिन +कोड +पिनकोड \ No newline at end of file diff --git 
a/nemo_text_processing/text_normalization/hi/taggers/telephone.py b/nemo_text_processing/text_normalization/hi/taggers/telephone.py new file mode 100644 index 000000000..039e30d74 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/telephone.py @@ -0,0 +1,227 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import ( + NEMO_CHAR, + NEMO_DIGIT, + NEMO_HI_DIGIT, + NEMO_SPACE, + NEMO_WHITE_SPACE, + GraphFst, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + +delete_zero = pynutil.delete(pynini.union("0", "०")) +delete_zero_optional = pynini.closure(delete_zero, 0, 1) +insert_shunya = pynutil.insert('शून्य') + insert_space + +# Load the number mappings from the TSV file +digit_to_word = pynini.string_file(get_abs_path("data/telephone/number.tsv")) +digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) +mobile_context = pynini.string_file(get_abs_path("data/telephone/mobile_context.tsv")) +landline_context = pynini.string_file(get_abs_path("data/telephone/landline_context.tsv")) +credit_context = pynini.string_file(get_abs_path("data/telephone/credit_context.tsv")) +pincode_context = 
pynini.string_file(get_abs_path("data/telephone/pincode_context.tsv")) + + +def generate_mobile(context_keywords): + context_before, context_after = get_context(context_keywords) + + allowed_digits = pynini.union("६", "७", "८", "९", "6", "7", "8", "9") + + # Filter cardinals to only include allowed digits + mobile_start_digit = allowed_digits @ digits | allowed_digits @ digit_to_word + + country_code_digits = pynini.closure((digit_to_word | digits | zero) + insert_space, 1, 3) + country_code = ( + pynutil.insert("country_code: \"") + + context_before + + pynini.cross("+", "प्लस") + + insert_space + + country_code_digits + + pynutil.insert("\" ") + + pynini.closure(delete_space, 0, 1) + ) + + extension_optional = pynini.closure( + pynutil.insert("extension: \"") + + pynini.closure((digit_to_word | digits | zero) + insert_space, 1, 3) + + context_after + + pynutil.insert("\" ") + + delete_space, + 0, + 1, + ) + + number_part = mobile_start_digit + insert_space + pynini.closure((digit_to_word | digits | zero) + insert_space, 9) + + number_without_country = ( + pynutil.insert("number_part: \"") + + context_before + + delete_zero_optional + + insert_shunya + + number_part + + context_after + + pynutil.insert("\" ") + + delete_space + ) + + number_with_country = ( + country_code + + pynutil.insert("number_part: \"") + + number_part + + context_after + + pynutil.insert("\" ") + + delete_space + ) + + return (number_with_country | number_without_country) + extension_optional + + +def get_landline(std_length, context_keywords): + context_before, context_after = get_context(context_keywords) + + allowed_digits = pynini.union("२", "३", "४", "६", "2", "3", "4", "6") + + # Filter cardinals to only include allowed digits + landline_start_digit = allowed_digits @ digits | allowed_digits @ digit_to_word + + std_code_graph = ( + delete_zero_optional + + insert_shunya + + pynini.closure((digit_to_word | digits | zero) + insert_space, std_length, std_length) + ) + + 
landline_digit_count = 9 - std_length + landline_graph = ( + landline_start_digit + + insert_space + + pynini.closure((digit_to_word | digits | zero) + insert_space, landline_digit_count, landline_digit_count) + ) + + separator_optional = pynini.closure(pynini.cross("-", "") | pynini.cross(".", ""), 0, 1) + + std_code_in_brackets = ( + delete_zero_optional + + delete_space + + pynutil.delete("(") + + pynini.closure(delete_space, 0, 1) + + std_code_graph + + pynini.closure(delete_space, 0, 1) + + pynutil.delete(")") + ) + + std_part = pynini.union(std_code_graph, std_code_in_brackets) + + return ( + pynutil.insert("number_part: \"") + + context_before + + std_part + + separator_optional + + delete_space + + landline_graph + + context_after + + pynutil.insert("\" ") + ) + + +def generate_landline(context_keywords): + graph = ( + get_landline(2, context_keywords) + | get_landline(3, context_keywords) + | get_landline(4, context_keywords) + | get_landline(5, context_keywords) + | get_landline(6, context_keywords) + | get_landline(7, context_keywords) + ) + + return graph + + +def get_context(keywords: list): + + all_digits = pynini.union(NEMO_HI_DIGIT, NEMO_DIGIT) + + non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE)) + word = pynini.closure(non_digit_char, 1) + pynini.accep(NEMO_SPACE) + + window = pynini.closure(word, 0, 5) + + before = pynini.closure(keywords + pynini.accep(NEMO_SPACE) + window, 0, 1) + + after = pynini.closure(pynutil.delete(NEMO_SPACE) + window + keywords, 0, 1) + + return before.optimize(), after.optimize() + + +def generate_credit(context_keywords): + context_before, context_after = get_context(context_keywords) + return ( + pynutil.insert("number_part: \"") + + context_before + + pynini.closure((digit_to_word | digits | zero) + insert_space, 4) + + context_after + + pynutil.insert("\" ") + + delete_space + ) + + +def generate_pincode(context_keywords): + context_before, context_after = 
get_context(context_keywords) + return ( + pynutil.insert("number_part: \"") + + context_before + + pynini.closure((digit_to_word | digits | zero) + insert_space, 6) + + context_after + + pynutil.insert("\" ") + + delete_space + ) + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for tagging telephone numbers, e.g. + ९१५७११४००७ -> telephone { number_part: "शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात" } + +९१ ९२१०५१५६०६ -> telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" } + १३७४-३०९९८८ -> telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization + """ + + def __init__(self): + super().__init__(name="telephone", kind="classify") + + mobile_number = generate_mobile(mobile_context) + landline = generate_landline(landline_context) + credit_card = generate_credit(credit_context) + pincode = generate_pincode(pincode_context) + + graph = ( + pynutil.add_weight(mobile_number, 0.7) + | pynutil.add_weight(landline, 0.8) + | pynutil.add_weight(credit_card, 0.9) + | pynutil.add_weight(pincode, 1) + ) + + self.final = graph.optimize() + self.fst = self.add_tokens(self.final) diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 0ce75822e..e5c99c26c 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -20,6 +20,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import ( + NEMO_SPACE, NEMO_WHITE_SPACE, GraphFst, delete_extra_space, @@ -33,6 +34,7 @@ from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst from 
nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.hi.taggers.word import WordFst @@ -123,6 +125,11 @@ def __init__( punct_graph = punctuation.fst logging.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes") + start_time = time.time() + telephone = TelephoneFst() + telephone_graph = telephone.fst + logging.debug(f"telephone: {time.time() - start_time: .2f}s -- {telephone_graph.num_states()} nodes") + classify = ( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(cardinal_graph, 1.1) @@ -132,6 +139,7 @@ def __init__( | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) ) start_time = time.time() @@ -141,20 +149,22 @@ def __init__( punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") punct = pynini.closure( pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct), + | (pynutil.insert(NEMO_SPACE) + punct), 1, ) classify |= pynutil.add_weight(word_graph, 100) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + pynini.closure(punct + pynutil.insert(NEMO_SPACE)) + + token + + pynini.closure(pynutil.insert(NEMO_SPACE) + punct) ) graph = token_plus_punct + pynini.closure( ( pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct + 
pynutil.insert(" ")) + | (pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE)) ) + token_plus_punct ) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py new file mode 100644 index 000000000..a6a677ec3 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import ( + MIN_NEG_WEIGHT, + NEMO_NOT_QUOTE, + NEMO_SPACE, + GraphFst, + delete_space, + insert_space, +) + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for verbalizing telephone numbers, e.g. 
+ telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" } -> प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह + telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" } -> शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="telephone", kind="verbalize", deterministic=deterministic) + + optional_country_code = pynini.closure( + pynutil.delete("country_code: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + delete_space + + insert_space, + 0, + 1, + ) + + number_part = ( + pynutil.delete("number_part: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynini.closure(pynutil.add_weight(pynutil.delete(NEMO_SPACE), MIN_NEG_WEIGHT), 0, 1) + + pynutil.delete("\"") + ) + + optional_extension = pynini.closure( + delete_space + + insert_space + + pynutil.delete("extension: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\""), + 0, + 1, + ) + + graph = optional_country_code + number_part + optional_extension + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index 60ba05810..f824a075a 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -19,6 +19,7 @@ from nemo_text_processing.text_normalization.hi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst +from nemo_text_processing.text_normalization.hi.verbalizers.telephone 
import TelephoneFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -58,6 +59,9 @@ def __init__(self, deterministic: bool = True): money = MoneyFst() money_graph = money.fst + telephone = TelephoneFst() + telephone_graph = telephone.fst + whitelist_graph = WhiteListFst(deterministic=deterministic).fst graph = ( @@ -69,6 +73,7 @@ def __init__(self, deterministic: bool = True): | measure_graph | money_graph | whitelist_graph + | telephone_graph ) self.fst = graph diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..7a1b2c662 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt @@ -0,0 +1,25 @@ +मेरा पुराना नंबर था ९१५७११४००७~मेरा पुराना नंबर था शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात +इसपे कॉल करो ०३८६२-३५१७९१~इसपे कॉल करो शून्य तीन आठ छह दो तीन पाँच एक सात नौ एक +मेरे इस नंबर पे कॉल करो १३७४-३०९९८८~मेरे इस नंबर पे कॉल करो शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ +इसपे कॉल करो ०१६८९११-४५७३~इसपे कॉल करो शून्य एक छह आठ नौ एक एक चार पाँच सात तीन ++९१ ७४४०४३१०८३ मेरे इस नंबर पे कॉल करो~प्लस नौ एक सात चार चार शून्य चार तीन एक शून्य आठ तीन मेरे इस नंबर पे कॉल करो ++९१ ९२१०५१५६०६ मेरे इस नंबर पे कॉल करो~प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो +भुगतान के लिए कार्ड के आखिरी अंक १२३४ दर्ज करें~भुगतान के लिए कार्ड के आखिरी अंक एक दो तीन चार दर्ज करें +मेरा पिन कोड ११००२३ है~मेरा पिन कोड एक एक शून्य शून्य दो तीन है +मेरा पुराना नंबर था 9157114007~मेरा पुराना नंबर था शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात +इसपे कॉल करो 03862-351791~इसपे कॉल करो शून्य तीन आठ छह दो तीन पाँच एक सात नौ एक +मेरे इस नंबर पे कॉल करो 1374 309988~मेरे इस नंबर पे कॉल करो शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ +इसपे कॉल करो 
0168911-4573~इसपे कॉल करो शून्य एक छह आठ नौ एक एक चार पाँच सात तीन ++91 7440431083 मेरे इस नंबर पे कॉल करो~प्लस नौ एक सात चार चार शून्य चार तीन एक शून्य आठ तीन मेरे इस नंबर पे कॉल करो ++91 9210515606 मेरे इस नंबर पे कॉल करो~प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो +भुगतान के लिए कार्ड के आखिरी अंक 1234 दर्ज करें~भुगतान के लिए कार्ड के आखिरी अंक एक दो तीन चार दर्ज करें +मेरा पिन कोड 110023 है~मेरा पिन कोड एक एक शून्य शून्य दो तीन है ++1 9210515606 मेरे इस नंबर पे कॉल करो~प्लस एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++४९ ९२१०५१५६०६ मेरे इस नंबर पे कॉल करो~प्लस चार नौ नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++353 9210515606 मेरे इस नंबर पे कॉल करो~प्लस तीन पाँच तीन नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++91 9876543210 123~प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य एक दो तीन ++1 6234517890 123~प्लस एक छह दो तीन चार पाँच एक सात आठ नौ शून्य एक दो तीन ++९१ ९८७६५४३२१० १२३~प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य एक दो तीन +(02229) 411128~शून्य दो दो दो नौ चार एक एक एक दो आठ +०२२.२९४१११२८~शून्य दो दो दो नौ चार एक एक एक दो आठ +0 (80) 26411128~शून्य आठ शून्य दो छह चार एक एक एक दो आठ \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index 498443f71..39d710120 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -81,10 +81,10 @@ testTNMoney() { # runtest $input #} -#testTNTelephone() { -# input=$PROJECT_DIR/en/data_text_normalization/test_cases_telephone.txt -# runtest $input -#} +testTNTelephone() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_telephone.txt + runtest $input +} testTNTime() { input=$PROJECT_DIR/hi/data_text_normalization/test_cases_time.txt diff --git a/tests/nemo_text_processing/hi/test_telephone.py 
b/tests/nemo_text_processing/hi/test_telephone.py index 7e43f7e82..e7b9f1c3d 100644 --- a/tests/nemo_text_processing/hi/test_telephone.py +++ b/tests/nemo_text_processing/hi/test_telephone.py @@ -16,12 +16,16 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestTelephone: inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @@ -29,3 +33,10 @@ class TestTelephone: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred.strip() == expected.strip() + + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) + assert pred == expected From eb7b3e64bc97e5287a3e900439b84710fed67771 Mon Sep 17 00:00:00 2001 From: shreeshd-tn Date: Fri, 10 Oct 2025 22:31:54 +0530 Subject: [PATCH 07/16] Rebase Hindi TN update: Fix Jenkinsfile for CI (#325) (#331) * Staging hi tn (#271) * Future Implementations for classes - Measure, Money, and Date (#258) * Future Implementations for classes - Measure, Money, and Date Signed-off-by: Namrata Gachchi * Resolved the conflicts with mm_yyyy and date ranges and added the previously removed failing test cases. 
Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed the unused empty string implementation Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fixes for the tagger files Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * reformatted decimal final graph Signed-off-by: Namrata Gachchi * incorporated the suggestion for decimal graph Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Century implementations Signed-off-by: Namrata Gachchi * Working on the yyyy format for the date class Signed-off-by: Namrata Gachchi * reverted yyyy code Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on future implementations Signed-off-by: Namrata Gachchi * working on improving the date class accuracy Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added year prefix for the date class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on the comma cases for date class Signed-off-by: Namrata Gachchi * minor fixes Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * implemented mixed fractions Signed-off-by: Namrata Gachchi * rectified the test case Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on quarterly measurements Signed-off-by: Namrata Gachchi * reformatted the prefixes and suffixes
for date tagger class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * replaced text tag with era tag for the date class Signed-off-by: Namrata Gachchi * Removed the text tag reference from date class verbalizer Signed-off-by: Namrata Gachchi --------- Signed-off-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update jenkins cache Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Potential fix for code scanning alert no. 821: Unused local variable Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> --------- Signed-off-by: Namrata Gachchi Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: shreeshd-tn * Fix Jenkinsfile for CI (#325) * Fix Jenkinsfile for CI Signed-off-by: Anand Joseph * Fix requirements for test Signed-off-by: Anand Joseph * Update paths and docker Signed-off-by: Anand Joseph * Fix docker name Signed-off-by: Anand Joseph * Fix click version Signed-off-by: Anand Joseph * Change path of grammars for sparrowhawk tests Signed-off-by: Anand Joseph * Update paths in sh_test.sh Signed-off-by: Anand Joseph * Update paths Signed-off-by: Anand Joseph * Revert paths Signed-off-by: Anand Joseph --------- Signed-off-by: Anand Joseph Signed-off-by: shreeshd-tn * Future Implementations for classes - Measure, Money, and Date (#258) * Future Implementations for classes - Measure, Money, and Date 
Signed-off-by: Namrata Gachchi * Resolved the conflicts with mm_yyyy and date ranges and added the previously removed failing test cases. Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed the unused empty string implementation Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fixes for the tagger files Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * reformatted decimal final graph Signed-off-by: Namrata Gachchi * incorporated the suggestion for decimal graph Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Century implementations Signed-off-by: Namrata Gachchi * Working on the yyyy format for the date class Signed-off-by: Namrata Gachchi * reverted yyyy code Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on future implementations Signed-off-by: Namrata Gachchi * working on improving the date class accuracy Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added year prefix for the date class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on the comma cases for date class Signed-off-by: Namrata Gachchi * minor fixes Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * implemented mixed fractions Signed-off-by: Namrata Gachchi * rectified the test case Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more
information, see https://pre-commit.ci * working on quarterly measurements Signed-off-by: Namrata Gachchi * reformatted the prefixes and suffixes for date tagger class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * replaced text tag with era tag for the date class Signed-off-by: Namrata Gachchi * Removed the text tag reference from date class verbalizer Signed-off-by: Namrata Gachchi --------- Signed-off-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: shreeshd-tn * update jenkins cache Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: shreeshd-tn * Potential fix for code scanning alert no. 821: Unused local variable Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Signed-off-by: shreeshd-tn * Hindi TN Future Implementations 2.0. - Fraction, Measure and Time (#310) * Staging hi tn (#271) * Future Implementations for classes - Measure, Money, and Date (#258) * Future Implementations for classes - Measure, Money, and Date Signed-off-by: Namrata Gachchi * Resolved the conflicts with mm_yyyy and date ranges and added the previously removed failing test cases. 
Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed the unused empty string implementation Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fixes for the tagger files Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * reformatted decimal final graph Signed-off-by: Namrata Gachchi * incorporated the suggestion for decimal graph Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Century implementations Signed-off-by: Namrata Gachchi * Working on the yyyy format for the date class Signed-off-by: Namrata Gachchi * reverted yyyy code Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on future implementations Signed-off-by: Namrata Gachchi * working on improving the date class accuracy Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added year prefix for the date class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on the comma cases for date class Signed-off-by: Namrata Gachchi * minor fixes Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * implemented mixed fractions Signed-off-by: Namrata Gachchi * rectified the test case Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on quarterly measurements Signed-off-by: Namrata Gachchi * reformatted the prefixes and suffixes
for date tagger class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * replaced text tag with era tag for the date class Signed-off-by: Namrata Gachchi * Removed the text tag reference from date class verbalizer Signed-off-by: Namrata Gachchi --------- Signed-off-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update jenkins cache Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Potential fix for code scanning alert no. 821: Unused local variable Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> --------- Signed-off-by: Namrata Gachchi Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> * Future Implementations Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Namrata Gachchi Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: shreeshd-tn * Hindi TN 2.0 - Telephone class integration from staging branch (#320) * telephone class integration (cherry picked from commit 
a7c9adf48038c95336419eb02543d53a50538e03) Signed-off-by: shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: shreeshd-tn * Updated date in Jenkins file to the PR creation date Signed-off-by: shreeshd-tn * Jenkins file date change Signed-off-by: shreeshd-tn * Trying today's date Signed-off-by: shreeshd-tn * improved country code coverage + some test cases Signed-off-by: shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Ignore test generated files Signed-off-by: shreeshd-tn * Improved landline detection and added edge test cases for proper coverage Signed-off-by: shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Deleted gitignore file Signed-off-by: shreeshd-tn --------- Signed-off-by: shreeshd-tn Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: shreeshd-tn * Ran tests successfully and updated cache date to today Signed-off-by: shreeshd-tn --------- Signed-off-by: Namrata Gachchi Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Signed-off-by: shreeshd-tn Signed-off-by: Anand Joseph Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Co-authored-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Co-authored-by: Mariana Graterol Fuenmayor --- .pre-commit-config.yaml | 2 +- Jenkinsfile | 91 +++++++++++++++++------------- requirements/requirements_test.txt | 6 +- 3 files changed, 55 insertions(+), 44 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index
cbc636f1a..a2886d56e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,4 +50,4 @@ repos: - id: black name: Format code args: [--skip-string-normalization, --line-length=119] - additional_dependencies: ['click==8.0.2'] + additional_dependencies: ['click>=8.0.2'] diff --git a/Jenkinsfile b/Jenkinsfile index 86019d132..b64e7e240 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,8 +1,8 @@ pipeline { agent { docker { - image 'tnitn_ci:py310' - args '--user 0:128 -v /home/jenkinsci:/home/jenkinsci -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""' + image 'tnitn_ci_py310:24.07' + args '-v /mnt/jenkins/jenkinsci:/home/jenkins -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""' } } options { @@ -27,17 +27,11 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-28-25-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-29-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { - stage('Add git safe directory'){ - steps{ - sh 'git config --global --add safe.directory /var/lib/jenkins/workspace/NTP_$GIT_BRANCH' - sh 'git config --global --add safe.directory /home/jenkinsci/workspace/NTP_$GIT_BRANCH' - } - } stage('PyTorch version') { steps { @@ -46,14 +40,6 @@ pipeline { } } - stage('Install test requirements') { - steps { - sh 'apt-get update && apt-get install -y bc' - } - } - - - stage('NeMo Installation') { steps { sh './reinstall.sh release' @@ -65,7 +51,10 @@ pipeline { when { anyOf { branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' + } } failFast true @@ -97,6 +86,8 @@ pipeline { when { anyOf { branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -120,6 +111,8 @@ 
pipeline { when { anyOf { branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -156,7 +149,9 @@ pipeline { stage('L0: Create AR TN/ITN Grammars') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -179,7 +174,9 @@ pipeline { stage('L0: Create FR TN/ITN & VI ITN & HU TN & IT TN') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -216,7 +213,9 @@ pipeline { stage('L0: Create RU TN/ITN Grammars & SV & PT') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -258,7 +257,9 @@ pipeline { stage('L0: Create HY TN/ITN Grammars & MR') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -284,7 +285,9 @@ pipeline { stage('L0: Create ZH TN/ITN Grammar') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -305,7 +308,9 @@ pipeline { stage('L0: Create JA ITN Grammars') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -325,7 +330,9 @@ pipeline { stage('L1: TN/ITN Tests CPU') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -409,10 +416,12 @@ pipeline { } } - stage('L2: Sparrowhawk Tests') { + stage('L2: EN Sparrowhawk Tests') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -441,11 +450,13 @@ pipeline { } } - + stage('L2: NeMo text processing') { when { anyOf { - branch 'main' + branch 'main' + branch 'staging/**' + branch 'staging_*' changeRequest target: 'main' } } @@ -453,23 +464,23 @@ pipeline { parallel { stage('L2: Eng TN') { 
steps { - sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && \ + sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkins/TestData/text_norm/output_${TIME} && \ cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1' - sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \ - cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkinsci/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \ + sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkins/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \ + cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkins/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \ cat $NORM_OUTPUT_DIR/test.pynini.txt && \ - cmp --silent $NORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_norm/ci/test_goal_py.txt || exit 1 && \ + cmp --silent $NORM_OUTPUT_DIR/test.pynini.txt /home/jenkins/TestData/text_norm/ci/test_goal_py.txt || exit 1 && \ rm -rf $NORM_OUTPUT_DIR' } } stage('L2: Eng ITN export') { steps { - sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && \ + sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && \ cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1' - sh 'TIME=`date +"%Y-%m-%d-%T"` && 
DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \ - cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \ - cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \ + sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \ + cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkins/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \ + cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkins/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \ rm -rf $DENORM_OUTPUT_DIR' } } @@ -477,18 +488,18 @@ pipeline { stage('L2: Eng alignment TN') { steps { - sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \ + sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkins/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \ cd nemo_text_processing/fst_alignment && python alignment.py --text="2615 Forest Av, 90501 CA, Santa Clara. 
10kg, 12/16/2018" --grammar=tn --rule=tokenize_and_classify --fst=${EN_TN_CACHE}/en_tn_True_deterministic_cased__tokenize.far 2>&1 | tee $NORM_OUTPUT_DIR/pred.txt && \ - cmp --silent $NORM_OUTPUT_DIR/pred.txt /home/jenkinsci/TestData/text_norm/ci/alignment_gold.txt || exit 1 && \ + cmp --silent $NORM_OUTPUT_DIR/pred.txt /home/jenkins/TestData/text_norm/ci/alignment_gold.txt || exit 1 && \ rm -rf $NORM_OUTPUT_DIR' } } stage('L2: Eng alignment ITN') { steps { - sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \ + sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \ cd nemo_text_processing/fst_alignment && python alignment.py --text="one million twenty three thousand two hundred eleven ten kilograms one hundred twenty three dollars and twenty five cents" --grammar=itn --rule=tokenize_and_classify --fst=${EN_TN_CACHE}/en_itn_lower_cased.far 2>&1 | tee $DENORM_OUTPUT_DIR/pred.txt && \ - cmp --silent $DENORM_OUTPUT_DIR/pred.txt /home/jenkinsci/TestData/text_denorm/ci/alignment_gold.txt || exit 1 && \ + cmp --silent $DENORM_OUTPUT_DIR/pred.txt /home/jenkins/TestData/text_denorm/ci/alignment_gold.txt || exit 1 && \ rm -rf $DENORM_OUTPUT_DIR' } } diff --git a/requirements/requirements_test.txt b/requirements/requirements_test.txt index a3e90e5dc..aacfde319 100644 --- a/requirements/requirements_test.txt +++ b/requirements/requirements_test.txt @@ -1,6 +1,6 @@ -black==19.10b0 -click==8.0.2 -isort[requirements]>5.1.0,<6.0.0 +black==25.1.0 +click>=8.0.2 +isort[requirements]>5.1.0,<=6.0.1 parameterized pynini==2.1.6.post1 pytest From dd0b8b7ccf54182f9e6a76c4d28e4b1ea4d770b8 Mon Sep 17 00:00:00 2001 From: shreeshd-tn Date: Fri, 17 Oct 2025 21:20:02 +0530 Subject: [PATCH 08/16] Hindi TN: Ordinal Implementation (#343) * Adding ordinals into staging_hi_tn Signed-off-by: shreeshd-tn * Ordinal Cleanup Signed-off-by: 
shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Review changes Signed-off-by: shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: shreeshd-tn Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 2 +- .../hi/data/measure/quarterly_units.tsv | 2 +- .../hi/data/ordinal/suffixes.tsv | 4 ++ .../text_normalization/hi/taggers/measure.py | 1 - .../text_normalization/hi/taggers/ordinal.py | 44 ++++++++++++++++ .../hi/taggers/tokenize_and_classify.py | 7 +++ .../hi/verbalizers/ordinal.py | 38 ++++++++++++++ .../hi/verbalizers/verbalize.py | 4 ++ .../test_cases_ordinal.txt | 52 +++++++++++++++++++ tests/nemo_text_processing/hi/test_ordinal.py | 11 ++++ .../hi/test_sparrowhawk_normalization.sh | 8 +-- 11 files changed, 166 insertions(+), 7 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv create mode 100644 nemo_text_processing/text_normalization/hi/taggers/ordinal.py create mode 100644 nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py create mode 100644 tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt diff --git a/Jenkinsfile b/Jenkinsfile index b64e7e240..6e08993d8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-29-25-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv 
b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv index 6bdfb34f8..5466df709 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv @@ -11,4 +11,4 @@ months महीने हफ़्ते सप्ताह सदियां -सदियों \ No newline at end of file +सदियों diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv new file mode 100644 index 000000000..37cd2af06 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv @@ -0,0 +1,4 @@ +वां +वीं +वें +वे वें \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 919a69929..575b3d5d5 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -53,7 +53,6 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): point = pynutil.delete(".") decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional - unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) diff --git a/nemo_text_processing/text_normalization/hi/taggers/ordinal.py b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py new file mode 100644 index 000000000..51cbd666a --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py @@ -0,0 +1,44 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying Hindi ordinals, e.g. + १०वां -> ordinal { integer: "दसवां" } + २१वीं -> ordinal { integer: "इक्कीसवीं" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: CardinalFst, deterministic: bool = True): + super().__init__(name="ordinal", kind="classify", deterministic=deterministic) + + suffixes_fst = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv")) + + graph = cardinal.final_graph + suffixes_fst + + final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + final_graph = self.add_tokens(final_graph) + + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index e5c99c26c..ceaf74689 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -33,6 +33,7 @@ from nemo_text_processing.text_normalization.hi.taggers.fraction import FractionFst from 
nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst +from nemo_text_processing.text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst @@ -114,6 +115,11 @@ def __init__( money_graph = money.fst logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") + start_time = time.time() + ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic) + ordinal_graph = ordinal.fst + logging.debug(f"ordinal: {time.time() - start_time: .2f}s -- {ordinal_graph.num_states()} nodes") + start_time = time.time() whitelist_graph = WhiteListFst( input_case=input_case, deterministic=deterministic, input_file=whitelist @@ -140,6 +146,7 @@ def __init__( | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(telephone_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) ) start_time = time.time() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py new file mode 100644 index 000000000..ab88603f6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for verbalizing Hindi ordinals, e.g. + ordinal { integer: "दसवां" } -> दसवां + ordinal { integer: "इक्कीसवीं" } -> इक्कीसवीं + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic) + + integer_value = delete_space + pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph = pynutil.delete("integer:") + integer_value + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index f824a075a..12ae316b1 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -19,6 +19,7 @@ from nemo_text_processing.text_normalization.hi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst +from nemo_text_processing.text_normalization.hi.verbalizers.ordinal import OrdinalFst from 
nemo_text_processing.text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -61,6 +62,8 @@ def __init__(self, deterministic: bool = True): telephone = TelephoneFst() telephone_graph = telephone.fst + ordinal = OrdinalFst(deterministic=deterministic) + ordinal_graph = ordinal.fst whitelist_graph = WhiteListFst(deterministic=deterministic).fst @@ -72,6 +75,7 @@ def __init__(self, deterministic: bool = True): | time_graph | measure_graph | money_graph + | ordinal_graph | whitelist_graph | telephone_graph ) diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..d1a072d0c --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,52 @@ +५वां~पाँचवां +५वीं~पाँचवीं +७वां~सातवां +७वीं~सातवीं +८वां~आठवां +८वीं~आठवीं +९वां~नौवां +९वीं~नौवीं +११वां~ग्यारहवां +१२वीं~बारहवीं +१४वां~चौदहवां +१६वीं~सोलहवीं +१७वां~सत्रहवां +१८वीं~अठारहवीं +१९वां~उन्नीसवां +२०वां~बीसवां +२१वां~इक्कीसवां +२५वीं~पच्चीसवीं +२७वें~सत्ताईसवें +३०वीं~तीसवीं +३३वां~तैंतीसवां +४०वीं~चालीसवीं +४५वां~पैंतालीसवां +५०वां~पचासवां +५६वें~छप्पनवें +६०वां~साठवां +६७वीं~सड़सठवीं +७५वीं~पचहत्तरवीं +८०वें~अस्सीवें +८८वां~अट्ठासीवां +९१वीं~इक्यानबेवीं +९९वां~निन्यानबेवां +१००वां~एक सौवां +१०१वां~एक सौ एकवां +१११वीं~एक सौ ग्यारहवीं +१२५वें~एक सौ पच्चीसवें +१५३वीं~एक सौ तिरेपनवीं +२००वीं~दो सौवीं +२१९वीं~दो सौ उन्नीसवीं +२४०वां~दो सौ चालीसवां +३२९वां~तीन सौ उनतीसवां +३६५वां~तीन सौ पैंसठवां +४५५वां~चार सौ पचपनवां +५५५वीं~पाँच सौ पचपनवीं +६४०वीं~छह सौ चालीसवीं +८९०वां~आठ सौ नब्बेवां +१००१वीं~एक हज़ार एकवीं +१०९१वें~एक हज़ार इक्यानबेवें +१७८२वीं~सत्रह सौ बयासीवीं +१८९०वां~एक हज़ार आठ सौ नब्बेवां +१९८१वीं~उन्नीस सौ इक्यासीवीं +९८२६वीं~अट्ठानबे सौ 
छब्बीसवीं \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_ordinal.py b/tests/nemo_text_processing/hi/test_ordinal.py index b65252694..3e5f4bfbb 100644 --- a/tests/nemo_text_processing/hi/test_ordinal.py +++ b/tests/nemo_text_processing/hi/test_ordinal.py @@ -17,13 +17,24 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestOrdinal: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_ordinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index 39d710120..a0b0931e2 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -76,10 +76,10 @@ testTNMoney() { runtest $input } -#testTNOrdinal() { -# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt -# runtest $input -#} +testTNOrdinal() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt + runtest $input +} testTNTelephone() { input=$PROJECT_DIR/hi/data_text_normalization/test_cases_telephone.txt From 96ba6a260ffcdf272e50f1a8aaf313b5dad81097 Mon 
Sep 17 00:00:00 2001 From: shreeshd-tn Date: Wed, 22 Oct 2025 21:13:16 +0530 Subject: [PATCH 09/16] Hindi TN: Main to staging Fix + Cardinals (leading zero update) (#348) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Staging hi tn (#271) * Future Implementations for classes - Measure, Money, and Date (#258) * Future Implementations for classes - Measure, Money, and Date Signed-off-by: Namrata Gachchi * Resolved the conflicts with mm_yyyy and date ranges and added the previously removed failing test cases. Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed the unused empty string implementation Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fixes for the tagger files Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * reformatted decimal final graph Signed-off-by: Namrata Gachchi * incorporated the suggestion for decimal graph Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Century implementations Signed-off-by: Namrata Gachchi * Working on the yyyy format for the date class Signed-off-by: Namrata Gachchi * reverted yyyy code Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on future implementations Signed-off-by: Namrata Gachchi * working on improving the date class accuracy Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added year prefix for the date class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * working on the commma cases for date class Signed-off-by: Namrata Gachchi * minor fixes Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * implemented mixed fractions Signed-off-by: Namrata Gachchi * rectified the test case Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * working on quarterly measurements Signed-off-by: Namrata Gachchi * reformatted the prefixes and suffixes for date tagger class Signed-off-by: Namrata Gachchi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * replaced text tag with era tag for the date class Signed-off-by: Namrata Gachchi * Removed the text tag reference from date class verbalizer Signed-off-by: Namrata Gachchi --------- Signed-off-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update jenkins cache Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Potential fix for code scanning alert no. 
821: Unused local variable Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> --------- Signed-off-by: Namrata Gachchi Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> * Fix Jenkinsfile for CI (#325) * Fix Jenkinsfile for CI Signed-off-by: Anand Joseph * Fix requirements for test Signed-off-by: Anand Joseph * Update paths and docker Signed-off-by: Anand Joseph * Fix docker name Signed-off-by: Anand Joseph * Fix click version Signed-off-by: Anand Joseph * Change path of grammars for sparrowhawk tests Signed-off-by: Anand Joseph * Update paths in sh_test.sh Signed-off-by: Anand Joseph * Update paths Signed-off-by: Anand Joseph * Revert paths Signed-off-by: Anand Joseph --------- Signed-off-by: Anand Joseph * Comma bugfix for En electronics (#332) * fix bug with commas and electronics Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Mariana Graterol Fuenmayor * update jenkins Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update Jenkinsfile (#341) Only mount TestData from path Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> * [pre-commit.ci] pre-commit suggestions (#335) updates: - [github.com/pre-commit/pre-commit-hooks: v5.0.0 → v6.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v5.0.0...v6.0.0) - [github.com/PyCQA/flake8: 7.2.0 → 
7.3.0](https://github.com/PyCQA/flake8/compare/7.2.0...7.3.0) - [github.com/PyCQA/isort: 6.0.1 → 6.1.0](https://github.com/PyCQA/isort/compare/6.0.1...6.1.0) - https://github.com/psf/black → https://github.com/psf/black-pre-commit-mirror - [github.com/psf/black-pre-commit-mirror: 25.1.0 → 25.9.0](https://github.com/psf/black-pre-commit-mirror/compare/25.1.0...25.9.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Cardinal: Leading zero changes Signed-off-by: shreeshd-tn --------- Signed-off-by: Namrata Gachchi Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Signed-off-by: Anand Joseph Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: shreeshd-tn Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Namrata Gachchi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Co-authored-by: anand-nv <105917641+anand-nv@users.noreply.github.com> --- .pre-commit-config.yaml | 10 +++++----- .../text_normalization/en/taggers/electronic.py | 7 ++++--- .../text_normalization/hi/taggers/cardinal.py | 9 ++++++++- .../data_text_normalization/test_cases_electronic.txt | 3 ++- .../hi/data_text_normalization/test_cases_cardinal.txt | 2 ++ 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a2886d56e..fca523e58 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-yaml - id: check-case-conflict @@ -30,22 +30,22 @@ repos: - id: requirements-txt-fixer - repo: https://github.com/PyCQA/flake8 - rev: 7.2.0 + rev: 7.3.0 hooks: - id: flake8 args: - --select=W605 - repo: 
https://github.com/PyCQA/isort - rev: 6.0.1 + rev: 6.1.0 hooks: - id: isort name: Format imports args: [ --multi-line=3, --trailing-comma, --force-grid-wrap=0, --use-parentheses, --line-width=119, -rc, -ws ] exclude: docs/ - - repo: https://github.com/psf/black - rev: 25.1.0 + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 25.9.0 hooks: - id: black name: Format code diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index 874d2e437..25c3c445a 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -127,14 +127,15 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): full_stop_accep = pynini.accep(".") dollar_accep = pynini.accep("$") # Include for the correct transduction of the money graph - excluded_symbols = full_stop_accep | dollar_accep + excluded_symbols = full_stop_accep | dollar_accep | pynini.accep(",") filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols) accepted_characters = NEMO_ALPHA | NEMO_DIGIT | filtered_symbols domain_component = full_stop_accep + pynini.closure(accepted_characters, 2) - graph_domain = ( + graph_domain = pynutil.add_weight( pynutil.insert('domain: "') + (pynini.closure(accepted_characters, 1) + pynini.closure(domain_component, 1)) - + pynutil.insert('"') + + pynutil.insert('"'), + 0.1, ).optimize() graph |= pynutil.add_weight(graph_domain, MIN_NEG_WEIGHT) diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index bc7594ad9..f361416f4 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import 
GraphFst +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space from nemo_text_processing.text_normalization.hi.utils import get_abs_path @@ -298,6 +298,12 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 0, graph_ten_padmas) graph_ten_shankhs.optimize() + # Only match exactly 2 digits to avoid interfering with telephone numbers, decimals, etc. + # e.g., "०५" -> "शून्य पाँच" + single_digit = digit | zero + graph_leading_zero = zero + insert_space + single_digit + graph_leading_zero = pynutil.add_weight(graph_leading_zero, 0.5) + final_graph = ( digit | zero @@ -319,6 +325,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): | graph_ten_padmas | graph_shankhs | graph_ten_shankhs + | graph_leading_zero ) optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt index 3a306158b..498528463 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt @@ -41,4 +41,5 @@ https://www.nvidia.com/dgx-basepod/~HTTPS colon slash slash WWW dot NVIDIA dot c i can use your card ending in 8876~i can use your card ending in eight eight seven six upgrade/update~upgrade slash update upgrade / update~upgrade slash update -upgrade/update/downgrade~upgrade slash update slash downgrade \ No newline at end of file +upgrade/update/downgrade~upgrade slash update slash downgrade +5.4, or 5.5~five point four, or five point five \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt 
b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt index 6ba21de69..2a52b2a20 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt @@ -143,3 +143,5 @@ ११०२२३४५५६७~ग्यारह अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ ५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ २ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल +०५~शून्य पाँच +०१~शून्य एक \ No newline at end of file From 5e89a8143e615380af8ba57e3c0a783a15c5e13c Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 22 Oct 2025 11:34:09 -0700 Subject: [PATCH 10/16] debug file issue Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- .../text_normalization/hi/data/ordinal/suffixes.tsv | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7569f2c2b..a83af7ca1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,7 +26,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-22-25-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-22-25-1' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv index 37cd2af06..878536c80 100644 --- a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv @@ -1,4 +1,3 @@ वां वीं -वें -वे वें \ No newline at end of file +वें \ No newline at end of file From 262cd6e8a07d8031d820355303eec7be72090112 Mon Sep 17 00:00:00 2001 From: Mariana Graterol 
Fuenmayor Date: Wed, 22 Oct 2025 12:14:51 -0700 Subject: [PATCH 11/16] debug ordinals error Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- .../text_normalization/hi/data/ordinal/suffixes.tsv | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a83af7ca1..2e834e5fe 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,7 +26,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-22-25-1' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-22-25-2' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv index 878536c80..2abb5c492 100644 --- a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv @@ -1,3 +1 @@ -वां -वीं -वें \ No newline at end of file +वे वें \ No newline at end of file From f364fbb536bf733d14f4ae5dbc12baddfc738fbb Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 22 Oct 2025 12:20:16 -0700 Subject: [PATCH 12/16] ci debug Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- .../text_normalization/hi/data/ordinal/suffixes.tsv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2e834e5fe..c75d15bdd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,7 +26,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - 
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-22-25-2' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-22-25-3' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv index 2abb5c492..b92f958e4 100644 --- a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv @@ -1 +1 @@ -वे वें \ No newline at end of file +a a \ No newline at end of file From 500e1fc2cbed5bb49b8cb8179c7b2373a107d220 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Wed, 22 Oct 2025 12:32:52 -0700 Subject: [PATCH 13/16] revert to original suffixes for ordinals Signed-off-by: Mariana Graterol Fuenmayor --- .../text_normalization/hi/data/ordinal/suffixes.tsv | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv index b92f958e4..37cd2af06 100644 --- a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv @@ -1 +1,4 @@ -a a \ No newline at end of file +वां +वीं +वें +वे वें \ No newline at end of file From aa22d29053c2ae899b12213273584fa11161f284 Mon Sep 17 00:00:00 2001 From: shreeshd-tn Date: Thu, 23 Oct 2025 22:31:04 +0530 Subject: [PATCH 14/16] CI fix: Missing init file (#350) Signed-off-by: shreeshd-tn --- .../text_normalization/hi/data/ordinal/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py b/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py 
new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From df5b3dc1510307b2c2d9fce80a536fc285678bde Mon Sep 17 00:00:00 2001 From: shreeshd-tn Date: Fri, 31 Oct 2025 21:09:43 +0530 Subject: [PATCH 15/16] HI TN: Staging branch cleanup for main merge (#355) * Review changes - cleanup Signed-off-by: shreeshd-tn * Missed cleanup Signed-off-by: shreeshd-tn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: shreeshd-tn Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../hi/data/measure/quarterly_units_list.tsv | 5 + ...erly_units.tsv => quarterly_units_map.tsv} | 5 +- .../hi/data/ordinal/exceptions.tsv | 12 ++ .../hi/data/ordinal/suffixes.tsv | 1 - .../hi/data/ordinal/suffixes_map.tsv | 2 + .../text_normalization/hi/graph_utils.py | 7 ++ .../text_normalization/hi/taggers/date.py | 15 +-- .../text_normalization/hi/taggers/fraction.py | 69 ++++++++--- .../text_normalization/hi/taggers/measure.py | 109 +++++++++++++----- .../text_normalization/hi/taggers/ordinal.py | 7 +- .../hi/taggers/punctuation.py | 6 +- .../hi/taggers/telephone.py | 59 +++++----- .../text_normalization/hi/taggers/time.py | 75 ++++++++---- .../hi/taggers/tokenize_and_classify.py 
| 41 ++----- .../text_normalization/hi/taggers/word.py | 6 +- .../hi/verbalizers/telephone.py | 2 +- .../test_cases_ordinal.txt | 10 ++ 17 files changed, 285 insertions(+), 146 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv rename nemo_text_processing/text_normalization/hi/data/measure/{quarterly_units.tsv => quarterly_units_map.tsv} (71%) create mode 100644 nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv new file mode 100644 index 000000000..6fcfb8b3a --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv @@ -0,0 +1,5 @@ +हफ़्ते +सप्ताह +सदियां +सदियों + diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv similarity index 71% rename from nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv rename to nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv index 5466df709..dc20bcb21 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv @@ -8,7 +8,4 @@ hp हॉर्सपॉवर d दिन month महीना months महीने -हफ़्ते -सप्ताह -सदियां -सदियों + diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv new file mode 100644 index 000000000..bfe5738d0 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv @@ -0,0 +1,12 @@ +१ला पहला +१ली पहली +२रा दूसरा +२री दूसरी +३रा तीसरा 
+३री तीसरी +४था चौथा +४थी चौथी +५वां पाँचवां +५वीं पाँचवीं +६ठा छठा +६ठी छठी diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv index 37cd2af06..922e9d6b8 100644 --- a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv @@ -1,4 +1,3 @@ वां वीं वें -वे वें \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv new file mode 100644 index 000000000..77139cff5 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv @@ -0,0 +1,2 @@ +वे वें + diff --git a/nemo_text_processing/text_normalization/hi/graph_utils.py b/nemo_text_processing/text_normalization/hi/graph_utils.py index 6a5d3c699..5bbc736fd 100644 --- a/nemo_text_processing/text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/text_normalization/hi/graph_utils.py @@ -30,6 +30,13 @@ NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() NEMO_HI_NON_ZERO = pynini.union("१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() NEMO_HI_ZERO = "०" + +HI_DEDH = "डेढ़" # 1.5 +HI_DHAI = "ढाई" # 2.5 +HI_SAVVA = "सवा" # quarter more (1.25) +HI_SADHE = "साढ़े" # half more (X.5) +HI_PAUNE = "पौने" # quarter less (0.75) + NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 37b192165..b25abcac6 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -65,11 +65,11 @@ def 
__init__(self, cardinal: GraphFst): (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand ) - cardinal_graph = ( - digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands + cardinal_graph = pynini.union( + digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands ) - graph_year = graph_year_thousands | graph_year_hundreds_as_thousands + graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands) delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") @@ -102,13 +102,10 @@ def __init__(self, cardinal: GraphFst): # Updated logic to use prefix_union year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") - graph_dd_mm_yyyy = ( - days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph - ) + delete_separator = pynini.union(delete_dash, delete_slash) + graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph - graph_mm_dd_yyyy = ( - months_graph + (delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph - ) + graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index d995608da..b5528deba 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -15,9 +15,21 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.graph_utils import ( + HI_DEDH, + HI_DHAI, + HI_PAUNE, + HI_SADHE, + 
HI_SAVVA, + NEMO_SPACE, + GraphFst, +) from nemo_text_processing.text_normalization.hi.utils import get_abs_path +HI_ONE_HALF = "१/२" # 1/2 +HI_ONE_QUARTER = "१/४" # 1/4 +HI_THREE_QUARTERS = "३/४" # 3/4 + class FractionFst(GraphFst): """ @@ -40,37 +52,62 @@ def __init__(self, cardinal, deterministic: bool = True): cardinal_graph = cardinal.final_graph self.optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1 + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + pynutil.insert(NEMO_SPACE), 0, 1 ) self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") self.numerator = ( - pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", " / "), "\" ") + pynutil.insert("numerator: \"") + + cardinal_graph + + pynini.cross(pynini.union("/", NEMO_SPACE + "/" + NEMO_SPACE), "\"") + + pynutil.insert(NEMO_SPACE) ) self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") - dedh_dhai_graph = pynini.string_map([("१ १/२", "डेढ़"), ("२ १/२", "ढाई")]) + dedh_dhai_graph = pynini.string_map( + [("१" + NEMO_SPACE + HI_ONE_HALF, HI_DEDH), ("२" + NEMO_SPACE + HI_ONE_HALF, HI_DHAI)] + ) - savva_numbers = cardinal_graph + pynini.cross(" १/४", "") - savva_graph = pynutil.insert("सवा ") + savva_numbers + savva_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_QUARTER, "") + savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers - sadhe_numbers = cardinal_graph + pynini.cross(" १/२", "") - sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers + sadhe_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_HALF, "") + sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) - paune_numbers = paune + pynini.cross(" ३/४", "") - paune_graph = pynutil.insert("पौने ") + paune_numbers - - 
graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ") + paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "") + paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers + + graph_dedh_dhai = ( + pynutil.insert("morphosyntactic_features: \"") + + dedh_dhai_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) - graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ") + graph_savva = ( + pynutil.insert("morphosyntactic_features: \"") + + savva_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) - graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ") + graph_sadhe = ( + pynutil.insert("morphosyntactic_features: \"") + + sadhe_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) - graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ") + graph_paune = ( + pynutil.insert("morphosyntactic_features: \"") + + paune_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) final_graph = ( self.optional_graph_negative - + pynini.closure(self.integer + pynini.accep(" "), 0, 1) + + pynini.closure(self.integer + pynini.accep(NEMO_SPACE), 0, 1) + self.numerator + self.denominator ) diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 575b3d5d5..b7d74731e 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -15,9 +15,24 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.hi.graph_utils import ( + HI_DEDH, + HI_DHAI, + HI_PAUNE, + HI_SADHE, + HI_SAVVA, + NEMO_SPACE, + GraphFst, + 
delete_space, + insert_space, +) from nemo_text_processing.text_normalization.hi.utils import get_abs_path +HI_POINT_FIVE = ".५" # .5 +HI_ONE_POINT_FIVE = "१.५" # 1.5 +HI_TWO_POINT_FIVE = "२.५" # 2.5 +HI_DECIMAL_25 = ".२५" # .25 +HI_DECIMAL_75 = ".७५" # .75 digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) @@ -54,7 +69,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) - quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) + + # Load quarterly units from separate files: map (FST) and list (FSA) + quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv")) + quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv")) + quarterly_units_graph = pynini.union(quarterly_units_map, quarterly_units_list) optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, @@ -65,16 +84,28 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # Define the quarterly measurements quarter = pynini.string_map( [ - (".५", "साढ़े"), - ("१.५", "डेढ़"), - ("२.५", "ढाई"), + (HI_POINT_FIVE, HI_SADHE), + (HI_ONE_POINT_FIVE, HI_DEDH), + (HI_TWO_POINT_FIVE, HI_DHAI), ] ) quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") # Define the unit handling - unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ") - units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") + unit = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + unit_graph + + pynutil.insert("\"") + + 
pynutil.insert(NEMO_SPACE) + ) + units = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + quarterly_units_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) # Handling symbols like x, X, * symbol_graph = pynini.string_map( @@ -94,24 +125,43 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + unit ) - dedh_dhai = pynini.string_map([("१.५", "डेढ़"), ("२.५", "ढाई")]) + dedh_dhai = pynini.string_map([(HI_ONE_POINT_FIVE, HI_DEDH), (HI_TWO_POINT_FIVE, HI_DHAI)]) dedh_dhai_graph = pynutil.insert("integer: \"") + dedh_dhai + pynutil.insert("\"") - savva_numbers = cardinal_graph + pynini.cross(".२५", "") - savva_graph = pynutil.insert("integer: \"सवा ") + savva_numbers + pynutil.insert("\"") + savva_numbers = cardinal_graph + pynini.cross(HI_DECIMAL_25, "") + savva_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_SAVVA) + + pynutil.insert(NEMO_SPACE) + + savva_numbers + + pynutil.insert("\"") + ) - sadhe_numbers = cardinal_graph + pynini.cross(".५", "") - sadhe_graph = pynutil.insert("integer: \"साढ़े ") + sadhe_numbers + pynutil.insert("\"") + sadhe_numbers = cardinal_graph + pynini.cross(HI_POINT_FIVE, "") + sadhe_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_SADHE) + + pynutil.insert(NEMO_SPACE) + + sadhe_numbers + + pynutil.insert("\"") + ) paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) - paune_numbers = paune + pynini.cross(".७५", "") - paune_graph = pynutil.insert("integer: \"पौने ") + paune_numbers + pynutil.insert("\"") + paune_numbers = paune + pynini.cross(HI_DECIMAL_75, "") + paune_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_PAUNE) + + pynutil.insert(NEMO_SPACE) + + paune_numbers + + pynutil.insert("\"") + ) graph_dedh_dhai = ( pynutil.insert("cardinal { ") + optional_graph_negative + dedh_dhai_graph - + pynutil.insert(" }") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + delete_space + units ) @@ -120,7 +170,8 @@ def 
__init__(self, cardinal: GraphFst, decimal: GraphFst): pynutil.insert("cardinal { ") + optional_graph_negative + savva_graph - + pynutil.insert(" }") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + delete_space + units ) @@ -129,7 +180,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): pynutil.insert("cardinal { ") + optional_graph_negative + sadhe_graph - + pynutil.insert(" }") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + delete_space + units ) @@ -149,7 +201,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("integer: \"") + cardinal_graph + pynutil.insert("\"") - + pynutil.insert(" }") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + delete_space + unit ) @@ -162,9 +215,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + cardinal_graph + pynutil.insert("\"") + pynutil.insert(" }") - + pynutil.insert(" units: \"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + symbol_graph - + pynutil.insert("\" ") + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + pynutil.insert("} }") + insert_space + pynutil.insert("tokens { cardinal { ") @@ -175,13 +230,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) graph = ( - pynutil.add_weight(graph_decimal, 0.01) - | pynutil.add_weight(graph_cardinal, 0.01) - | pynutil.add_weight(graph_exceptions, 0.01) - | pynutil.add_weight(graph_dedh_dhai, 0.001) - | pynutil.add_weight(graph_savva, 0.005) - | pynutil.add_weight(graph_sadhe, 0.005) - | pynutil.add_weight(graph_paune, -0.2) + pynutil.add_weight(graph_decimal, 0.1) + | pynutil.add_weight(graph_cardinal, 0.1) + | pynutil.add_weight(graph_exceptions, 0.1) + | pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_savva, -0.1) + | pynutil.add_weight(graph_sadhe, -0.1) + | pynutil.add_weight(graph_paune, -0.5) ) self.graph = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/ordinal.py 
b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py index 51cbd666a..5f1cefed4 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py @@ -34,9 +34,14 @@ class OrdinalFst(GraphFst): def __init__(self, cardinal: CardinalFst, deterministic: bool = True): super().__init__(name="ordinal", kind="classify", deterministic=deterministic) - suffixes_fst = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv")) + suffixes_list = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv")) + suffixes_map = pynini.string_file(get_abs_path("data/ordinal/suffixes_map.tsv")) + suffixes_fst = pynini.union(suffixes_list, suffixes_map) + exceptions = pynini.string_file(get_abs_path("data/ordinal/exceptions.tsv")) graph = cardinal.final_graph + suffixes_fst + exceptions = pynutil.add_weight(exceptions, -0.1) + graph = pynini.union(exceptions, graph) final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"") final_graph = self.add_tokens(final_graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/punctuation.py b/nemo_text_processing/text_normalization/hi/taggers/punctuation.py index 8309ba030..14c9a1a55 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/hi/taggers/punctuation.py @@ -36,9 +36,9 @@ def __init__(self, deterministic: bool = True): emphasis = ( pynini.accep("<") - + ( - (pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1) + pynini.closure(pynini.accep("/"), 0, 1)) - | (pynini.accep("/") + pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)) + + pynini.union( + (pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1) + pynini.closure(pynini.accep("/"), 0, 1)), + (pynini.accep("/") + pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)), ) + pynini.accep(">") ) diff --git a/nemo_text_processing/text_normalization/hi/taggers/telephone.py 
b/nemo_text_processing/text_normalization/hi/taggers/telephone.py index 039e30d74..d20870c0d 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/hi/taggers/telephone.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,7 +28,11 @@ ) from nemo_text_processing.text_normalization.hi.utils import get_abs_path -delete_zero = pynutil.delete(pynini.union("0", "०")) +HI_ZERO_DIGIT = pynini.union("0", "०") +HI_MOBILE_START_DIGITS = pynini.union("६", "७", "८", "९", "6", "7", "8", "9").optimize() +HI_LANDLINE_START_DIGITS = pynini.union("२", "३", "४", "६", "2", "3", "4", "6").optimize() + +delete_zero = pynutil.delete(HI_ZERO_DIGIT) delete_zero_optional = pynini.closure(delete_zero, 0, 1) insert_shunya = pynutil.insert('शून्य') + insert_space @@ -41,16 +45,17 @@ credit_context = pynini.string_file(get_abs_path("data/telephone/credit_context.tsv")) pincode_context = pynini.string_file(get_abs_path("data/telephone/pincode_context.tsv")) +# Reusable optimized graph for any digit token +num_token = pynini.union(digit_to_word, digits, zero).optimize() -def generate_mobile(context_keywords): - context_before, context_after = get_context(context_keywords) - allowed_digits = pynini.union("६", "७", "८", "९", "6", "7", "8", "9") +def generate_mobile(context_keywords: pynini.Fst) -> pynini.Fst: + context_before, context_after = get_context(context_keywords) # Filter cardinals to only include allowed digits - mobile_start_digit = allowed_digits @ digits | allowed_digits @ digit_to_word + mobile_start_digit = pynini.union(HI_MOBILE_START_DIGITS @ digits, HI_MOBILE_START_DIGITS @ digit_to_word) - country_code_digits = pynini.closure((digit_to_word | digits | zero) + insert_space, 1, 3) + 
country_code_digits = pynini.closure(num_token + insert_space, 1, 3) country_code = ( pynutil.insert("country_code: \"") + context_before @@ -63,7 +68,7 @@ def generate_mobile(context_keywords): extension_optional = pynini.closure( pynutil.insert("extension: \"") - + pynini.closure((digit_to_word | digits | zero) + insert_space, 1, 3) + + pynini.closure(num_token + insert_space, 1, 3) + context_after + pynutil.insert("\" ") + delete_space, @@ -71,7 +76,7 @@ def generate_mobile(context_keywords): 1, ) - number_part = mobile_start_digit + insert_space + pynini.closure((digit_to_word | digits | zero) + insert_space, 9) + number_part = mobile_start_digit + insert_space + pynini.closure(num_token + insert_space, 9) number_without_country = ( pynutil.insert("number_part: \"") @@ -93,31 +98,27 @@ def generate_mobile(context_keywords): + delete_space ) - return (number_with_country | number_without_country) + extension_optional + return (pynini.union(number_with_country, number_without_country) + extension_optional).optimize() -def get_landline(std_length, context_keywords): +def get_landline(std_length: int, context_keywords: pynini.Fst) -> pynini.Fst: context_before, context_after = get_context(context_keywords) - allowed_digits = pynini.union("२", "३", "४", "६", "2", "3", "4", "6") - # Filter cardinals to only include allowed digits - landline_start_digit = allowed_digits @ digits | allowed_digits @ digit_to_word + landline_start_digit = pynini.union(HI_LANDLINE_START_DIGITS @ digits, HI_LANDLINE_START_DIGITS @ digit_to_word) std_code_graph = ( - delete_zero_optional - + insert_shunya - + pynini.closure((digit_to_word | digits | zero) + insert_space, std_length, std_length) + delete_zero_optional + insert_shunya + pynini.closure(num_token + insert_space, std_length, std_length) ) landline_digit_count = 9 - std_length landline_graph = ( landline_start_digit + insert_space - + pynini.closure((digit_to_word | digits | zero) + insert_space, landline_digit_count, 
landline_digit_count) + + pynini.closure(num_token + insert_space, landline_digit_count, landline_digit_count) ) - separator_optional = pynini.closure(pynini.cross("-", "") | pynini.cross(".", ""), 0, 1) + separator_optional = pynini.closure(pynini.union(pynini.cross("-", ""), pynini.cross(".", "")), 0, 1) std_code_in_brackets = ( delete_zero_optional @@ -140,10 +141,10 @@ def get_landline(std_length, context_keywords): + landline_graph + context_after + pynutil.insert("\" ") - ) + ).optimize() -def generate_landline(context_keywords): +def generate_landline(context_keywords: pynini.Fst) -> pynini.Fst: graph = ( get_landline(2, context_keywords) | get_landline(3, context_keywords) @@ -153,10 +154,10 @@ def generate_landline(context_keywords): | get_landline(7, context_keywords) ) - return graph + return graph.optimize() -def get_context(keywords: list): +def get_context(keywords: pynini.Fst): all_digits = pynini.union(NEMO_HI_DIGIT, NEMO_DIGIT) @@ -172,28 +173,28 @@ def get_context(keywords: list): return before.optimize(), after.optimize() -def generate_credit(context_keywords): +def generate_credit(context_keywords: pynini.Fst) -> pynini.Fst: context_before, context_after = get_context(context_keywords) return ( pynutil.insert("number_part: \"") + context_before - + pynini.closure((digit_to_word | digits | zero) + insert_space, 4) + + pynini.closure(num_token + insert_space, 4) + context_after + pynutil.insert("\" ") + delete_space - ) + ).optimize() -def generate_pincode(context_keywords): +def generate_pincode(context_keywords: pynini.Fst) -> pynini.Fst: context_before, context_after = get_context(context_keywords) return ( pynutil.insert("number_part: \"") + context_before - + pynini.closure((digit_to_word | digits | zero) + insert_space, 6) + + pynini.closure(num_token + insert_space, 6) + context_after + pynutil.insert("\" ") + delete_space - ) + ).optimize() class TelephoneFst(GraphFst): diff --git 
a/nemo_text_processing/text_normalization/hi/taggers/time.py b/nemo_text_processing/text_normalization/hi/taggers/time.py index e78b31380..09defaab2 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/text_normalization/hi/taggers/time.py @@ -15,9 +15,24 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space +from nemo_text_processing.text_normalization.hi.graph_utils import ( + HI_DEDH, + HI_DHAI, + HI_PAUNE, + HI_SADHE, + HI_SAVVA, + NEMO_SPACE, + GraphFst, + insert_space, +) from nemo_text_processing.text_normalization.hi.utils import get_abs_path +# Time patterns specific to time tagger +HI_DOUBLE_ZERO = "००" +HI_TIME_FIFTEEN = ":१५" # :15 +HI_TIME_THIRTY = ":३०" # :30 +HI_TIME_FORTYFIVE = ":४५" # :45 + hours_graph = pynini.string_file(get_abs_path("data/time/hours.tsv")) minutes_graph = pynini.string_file(get_abs_path("data/time/minutes.tsv")) seconds_graph = pynini.string_file(get_abs_path("data/time/seconds.tsv")) @@ -55,36 +70,56 @@ def __init__(self, cardinal: GraphFst): graph_hm = self.hours + delete_colon + insert_space + self.minutes # hour - graph_h = self.hours + delete_colon + pynutil.delete("००") + graph_h = self.hours + delete_colon + pynutil.delete(HI_DOUBLE_ZERO) - dedh_dhai_graph = pynini.string_map([("१:३०", "डेढ़"), ("२:३०", "ढाई")]) + dedh_dhai_graph = pynini.string_map([("१" + HI_TIME_THIRTY, HI_DEDH), ("२" + HI_TIME_THIRTY, HI_DHAI)]) - savva_numbers = cardinal_graph + pynini.cross(":१५", "") - savva_graph = pynutil.insert("सवा ") + savva_numbers + savva_numbers = cardinal_graph + pynini.cross(HI_TIME_FIFTEEN, "") + savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers - sadhe_numbers = cardinal_graph + pynini.cross(":३०", "") - sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers + sadhe_numbers = cardinal_graph + pynini.cross(HI_TIME_THIRTY, "") + sadhe_graph = 
pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) - paune_numbers = paune + pynini.cross(":४५", "") - paune_graph = pynutil.insert("पौने ") + paune_numbers - - graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ") + paune_numbers = paune + pynini.cross(HI_TIME_FORTYFIVE, "") + paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers + + graph_dedh_dhai = ( + pynutil.insert("morphosyntactic_features: \"") + + dedh_dhai_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) - graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ") + graph_savva = ( + pynutil.insert("morphosyntactic_features: \"") + + savva_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) - graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ") + graph_sadhe = ( + pynutil.insert("morphosyntactic_features: \"") + + sadhe_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) - graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ") + graph_paune = ( + pynutil.insert("morphosyntactic_features: \"") + + paune_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) final_graph = ( graph_hms - | pynutil.add_weight(graph_hm, 0.01) - | pynutil.add_weight(graph_h, 0.01) - | pynutil.add_weight(graph_dedh_dhai, 0.001) - | pynutil.add_weight(graph_savva, 0.005) - | pynutil.add_weight(graph_sadhe, 0.005) - | pynutil.add_weight(graph_paune, 0.001) + | pynutil.add_weight(graph_hm, 0.3) + | pynutil.add_weight(graph_h, 0.3) + | pynutil.add_weight(graph_dedh_dhai, 0.1) + | pynutil.add_weight(graph_savva, 0.2) + | pynutil.add_weight(graph_sadhe, 0.2) + | pynutil.add_weight(graph_paune, 0.1) ) final_graph = self.add_tokens(final_graph) diff --git 
a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index ceaf74689..e3e6fc5d8 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -14,7 +14,6 @@ import logging import os -import time import pynini from pynini.lib import pynutil @@ -80,61 +79,39 @@ def __init__( else: logging.info(f"Creating ClassifyFst grammars.") - start_time = time.time() cardinal = CardinalFst(deterministic=deterministic) cardinal_graph = cardinal.fst - logging.debug(f"cardinal: {time.time() - start_time: .2f}s -- {cardinal_graph.num_states()} nodes") - start_time = time.time() decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) decimal_graph = decimal.fst - logging.debug(f"decimal: {time.time() - start_time: .2f}s -- {decimal_graph.num_states()} nodes") - start_time = time.time() fraction = FractionFst(cardinal=cardinal, deterministic=deterministic) fraction_graph = fraction.fst - logging.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes") - start_time = time.time() date = DateFst(cardinal=cardinal) date_graph = date.fst - logging.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes") - start_time = time.time() timefst = TimeFst(cardinal=cardinal) time_graph = timefst.fst - logging.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes") - start_time = time.time() measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst - logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") - start_time = time.time() money = MoneyFst(cardinal=cardinal) money_graph = money.fst - logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") - start_time = time.time() ordinal = 
OrdinalFst(cardinal=cardinal, deterministic=deterministic) ordinal_graph = ordinal.fst - logging.debug(f"ordinal: {time.time() - start_time: .2f}s -- {ordinal_graph.num_states()} nodes") - start_time = time.time() whitelist_graph = WhiteListFst( input_case=input_case, deterministic=deterministic, input_file=whitelist ).fst - logging.debug(f"whitelist: {time.time() - start_time: .2f}s -- {whitelist_graph.num_states()} nodes") - start_time = time.time() punctuation = PunctuationFst(deterministic=deterministic) punct_graph = punctuation.fst - logging.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes") - start_time = time.time() telephone = TelephoneFst() telephone_graph = telephone.fst - logging.debug(f"telephone: {time.time() - start_time: .2f}s -- {telephone_graph.num_states()} nodes") classify = ( pynutil.add_weight(whitelist_graph, 1.01) @@ -149,18 +126,18 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.1) ) - start_time = time.time() word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst - logging.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes") punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") punct = pynini.closure( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(NEMO_SPACE) + punct), + pynini.union( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space), + (pynutil.insert(NEMO_SPACE) + punct), + ), 1, ) - classify |= pynutil.add_weight(word_graph, 100) + classify = pynini.union(classify, pynutil.add_weight(word_graph, 100)) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( pynini.closure(punct + pynutil.insert(NEMO_SPACE)) @@ -169,15 +146,15 @@ def __init__( ) graph = token_plus_punct + pynini.closure( - ( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | 
(pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE)) + pynini.union( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space), + (pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE)), ) + token_plus_punct ) graph = delete_space + graph + delete_space - graph |= punct + graph = pynini.union(graph, punct) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/word.py b/nemo_text_processing/text_normalization/hi/taggers/word.py index 151a72e99..00feb1827 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/word.py +++ b/nemo_text_processing/text_normalization/hi/taggers/word.py @@ -40,9 +40,9 @@ def __init__(self, punctuation: PunctuationFst, deterministic: bool = True): # Define Hindi characters and symbols using pynini.union HINDI_CHAR = pynini.union( - *[chr(i) for i in range(ord("ऀ"), ord("ः") + 1)], # Hindi vowels and consonants - *[chr(i) for i in range(ord("अ"), ord("ह") + 1)], # More Hindi characters - *[chr(i) for i in range(ord("ा"), ord("्") + 1)], # Hindi diacritics + *[chr(i) for i in range(0x0900, 0x0903 + 1)], # Hindi vowels and consonants + *[chr(i) for i in range(0x0905, 0x0939 + 1)], # More Hindi characters + *[chr(i) for i in range(0x093E, 0x094D + 1)], # Hindi diacritics ).optimize() # Include punctuation in the graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py index a6a677ec3..55ebeab01 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt index d1a072d0c..9bdcab2a4 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt @@ -1,5 +1,15 @@ +१ला~पहला +१ली~पहली +२रा~दूसरा +२री~दूसरी +३रा~तीसरा +३री~तीसरी +४था~चौथा +४थी~चौथी ५वां~पाँचवां ५वीं~पाँचवीं +६ठा~छठा +६ठी~छठी ७वां~सातवां ७वीं~सातवीं ८वां~आठवां From 6dc912fa10a47e14d3ee04988bf155501443bdaf Mon Sep 17 00:00:00 2001 From: shreeshd-tn Date: Fri, 31 Oct 2025 22:44:22 +0530 Subject: [PATCH 16/16] Cache date change (#356) * Cache date change Signed-off-by: shreeshd-tn * Cache date changes again Signed-off-by: shreeshd-tn --------- Signed-off-by: shreeshd-tn --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index c75d15bdd..b1696c262 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,7 +26,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-22-25-3' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-31-25-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages {