Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
d4db3eb
PR: Add Vietnamese text normalization for cardinal semiotic class (#289)
folivoramanh Jun 18, 2025
f95aeda
Ordinal and Decimal for Vietnamese TN (#290)
folivoramanh Jun 26, 2025
a8597a8
Vietnamese TN - Fraction (#296)
folivoramanh Jun 30, 2025
95fc65d
Date Semiotic Class for Vietnamese TN (#298)
folivoramanh Jul 9, 2025
e4c8e3c
Time - semiotic class for Vietnamese TN (#302)
folivoramanh Jul 22, 2025
af2082d
Add Vietnamese TN support for Money and Range semiotic classes (#304)
folivoramanh Aug 1, 2025
bfa8eef
Add Vietnamese measure text normalization support (#307)
folivoramanh Aug 12, 2025
b8506ad
Vietnamese MRC 1.0 fix case (#312)
folivoramanh Aug 19, 2025
7dc6e1f
Fix Jenkinsfile for CI (#325) (#327)
folivoramanh Oct 10, 2025
1252387
Fix word range (#334)
folivoramanh Oct 12, 2025
10cb69d
Date time itn (#333)
folivoramanh Oct 12, 2025
edb11ca
Staging vi tn signed off (#339)
folivoramanh Oct 13, 2025
ea3fde6
Comma bugfix for En electronics (#332)
mgrafu Sep 25, 2025
a520a2b
remove unuse import (#340)
folivoramanh Oct 15, 2025
fcebf60
Update Jenkinsfile (#341)
anand-nv Oct 15, 2025
8c4055b
[pre-commit.ci] pre-commit suggestions (#335)
pre-commit-ci[bot] Oct 15, 2025
6baae96
update jenkins cache
mgrafu Oct 16, 2025
997042e
fill missing lang in arg run (#347)
folivoramanh Oct 29, 2025
8fcb369
Staging vi tn DCO fixed (#354)
folivoramanh Oct 29, 2025
87328af
Merge branch 'main' into staging_vi_tn
mgrafu Oct 29, 2025
ba801d7
update vi cache date
mgrafu Oct 29, 2025
2a41b81
Refactor Vietnamese (#357)
folivoramanh Nov 3, 2025
c7a1c21
delete unuse import (#358)
folivoramanh Nov 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pipeline {
HU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-29-25-0'
SV_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
ZH_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/11-13-24-0'
IT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-22-24-0'
Expand Down Expand Up @@ -171,7 +171,7 @@ pipeline {
}
}

stage('L0: Create FR TN/ITN & VI ITN & HU TN & IT TN') {
stage('L0: Create FR TN/ITN & VI TN/ITN & HU TN & IT TN') {
when {
anyOf {
branch 'main'
Expand All @@ -197,6 +197,11 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}'
}
}
stage('L0: VI TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}'
}
}
stage('L0: HU TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ $ đô la mỹ
₩ won
₩ uôn
RM ringgit
₫ đồng
£ bảng anh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
- gạch
_ gạch dưới
_ shift gạch
_ shift trừ
_ síp gạch
! chấm than
# thăng
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
. chấm
- gạch
- gạch ngang
_ gạch dưới
_ shift gạch
_ shift trừ
_ síp gạch
/ sẹc
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
mốt 1
tư 4
lăm 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
linh 0
lẻ 0
không 0
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, '"').optimize()

NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
Expand All @@ -47,6 +47,7 @@
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
insert_space = pynutil.insert(" ")
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
delete_single_space = pynutil.delete(NEMO_SPACE)

# French frequently compounds numbers with hyphen.
delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,117 +36,120 @@ class CardinalFst(GraphFst):

def __init__(self):
super().__init__(name="cardinal", kind="classify")
graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

thousand_words = pynini.union("ngàn", "nghìn")
negative_words = pynini.union("âm", "trừ")

graph_hundred = pynini.cross("trăm", "")
graph_ten = pynini.cross("mươi", "")
zero = pynini.cross(pynini.union("linh", "lẻ"), "0")

graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
graph_one = pynini.cross("mốt", "1")
graph_four = pynini.cross("tư", "4")
graph_five = pynini.cross("lăm", "5")
graph_half = pynini.cross("rưỡi", "5")
graph_hundred = pynini.cross("trăm", "")
graph_ten = pynini.cross("mươi", "")
zero = pynini.cross(pynini.union("linh", "lẻ"), "0")

optional_ten = pynini.closure(delete_space + graph_ten, 0, 1)
last_digit_exception = pynini.project(pynini.cross("năm", "5"), "input")
last_digit = pynini.union(
self.last_digit = pynini.union(
(pynini.project(graph_digit, "input") - last_digit_exception.arcsort()) @ graph_digit,
graph_one,
graph_four,
graph_five,
)

graph_hundred_ties_component = (graph_digit | graph_zero) + delete_space + graph_hundred
graph_hundred_ties_component += delete_space
graph_hundred_ties_component += pynini.union(
last_digit = self.last_digit
# Build hundreds component (e.g., "một trăm", "hai trăm")
graph_hundreds_component = (graph_digit | graph_zero) + delete_space + graph_hundred
graph_hundreds_component += delete_space
graph_hundreds_component += pynini.union(
graph_teen,
(graph_half | graph_four | graph_one) + pynutil.insert("0"),
graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0")),
zero + delete_space + (graph_digit | graph_four),
pynutil.insert("00"),
)
graph_hundred_ties_component |= (
(graph_half | graph_four | graph_one) + pynutil.insert("0", weight=0.1),
graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0", weight=0.1)),
zero + delete_space + (graph_digit | graph_four | graph_five),
pynutil.insert("00", weight=0.1),
).optimize()
graph_hundreds_component |= (
pynutil.insert("0")
+ delete_space
+ pynini.union(
graph_teen,
graph_ties + optional_ten + delete_space + last_digit,
graph_ties + delete_space + graph_ten + pynutil.insert("0"),
zero + delete_space + (graph_digit | graph_four),
)
graph_ties + delete_space + graph_ten + pynutil.insert("0", weight=0.1),
zero + delete_space + (graph_digit | graph_four | graph_five),
).optimize()
)
graph_hundred_component = graph_hundreds_component | (
pynutil.insert("00", weight=0.1) + delete_space + graph_digit
)
graph_hundred_component = graph_hundred_ties_component | (pynutil.insert("00") + delete_space + graph_digit)

graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
)
self.graph_hundred_component_at_least_one_none_zero_digit = (
graph_hundred_component_at_least_one_none_zero_digit
graph_hundred_component_at_least_one_none_zero_digit.optimize()
)
graph_hundred_ties_zero = graph_hundred_ties_component | pynutil.insert("000")
graph_hundreds_zero = graph_hundreds_component | pynutil.insert("000", weight=0.1)

graph_thousands = pynini.union(
graph_hundred_component_at_least_one_none_zero_digit
+ delete_space
+ pynutil.delete(pynini.union("nghìn", "ngàn")),
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete(thousand_words),
pynutil.insert("000", weight=0.1),
)

graph_ten_thousand = pynini.union(
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("vạn"),
pynutil.insert("0000", weight=0.1),
)

graph_ten_thousand_suffix = pynini.union(
graph_digit + delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")),
pynutil.insert("0", weight=0.1),
)
).optimize()

graph_million = pynini.union(
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("triệu"),
pynutil.insert("000", weight=0.1),
)
).optimize()
graph_billion = pynini.union(
graph_hundred_component_at_least_one_none_zero_digit
+ delete_space
+ pynutil.delete(pynini.union("tỉ", "tỷ")),
pynutil.insert("000", weight=0.1),
)
).optimize()

# Main graph combining all magnitude levels
graph = pynini.union(
# Full format: billion + million + thousand + hundred
graph_billion
+ delete_space
+ graph_million
+ delete_space
+ graph_thousands
+ delete_space
+ graph_hundred_ties_zero,
graph_ten_thousand + delete_space + graph_ten_thousand_suffix + delete_space + graph_hundred_ties_zero,
+ graph_hundreds_zero,
# Special thousand format with last digit or "rưỡi" (half)
graph_hundred_component_at_least_one_none_zero_digit
+ delete_space
+ pynutil.delete(pynini.union("nghìn", "ngàn"))
+ pynutil.delete(thousand_words)
+ delete_space
+ (((last_digit | graph_half) + pynutil.insert("00")) | graph_hundred_ties_zero),
+ pynini.union(
pynini.union(last_digit, graph_half) + pynutil.insert("00", weight=0.1), graph_hundreds_zero
),
# Single digits (for non-exception cases)
graph_digit,
graph_zero,
)

graph = graph @ pynini.union(
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
"0",
graph = (
graph
@ pynini.union(
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
"0",
).optimize()
)

# don't convert cardinals from zero to nine inclusive
graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), "input")
single_digits = pynini.project(pynini.union(graph_digit, graph_zero), "input").optimize()

self.graph_no_exception = graph

self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
self.graph = pynini.difference(pynini.project(graph, "input"), single_digits) @ graph

optional_minus_graph = pynini.closure(
pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE,
pynutil.insert("negative: ") + pynini.cross(negative_words, '"-"') + NEMO_SPACE,
0,
1,
)
Expand Down
Loading
Loading