NVIDIA · mayuris-00 · Jun 29, 2026 · Jun 30, 2026
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/roman/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/roman/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/roman/key_words.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/roman/key_words.tsv
@@ -0,0 +1,4 @@
+अध्याय
+खंड
+खण्ड
+कक्षा
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/roman/roman_numerals.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/roman/roman_numerals.tsv
@@ -0,0 +1,13 @@
+I	1
+V	5
+X	10
+L	50
+C	100
+D	500
+M	1000
+IV	4
+IX	9
+XL	40
+XC	90
+CD	400
+CM	900
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/roman.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/roman.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
+    DEVANAGARI_DIGIT,
+    NEMO_SIGMA,
+    GraphFst,
+    delete_space,
+    insert_space,
+)
+from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path, load_labels
+
+
+class RomanFst(GraphFst):
+    """
+    Finite state transducer for classifying spoken numbers as Roman numerals
+    when they follow a small, fixed set of context key words (chapter, volume,
+    class numbering). The conversion is deliberately restricted to these
+    predictable contexts; regnal, papal and product names (e.g. भास्कर-II) are a
+    documented limitation because the same number is ambiguous between Arabic and
+    Roman form.
+        e.g. अध्याय तीन -> tokens { roman { key: "अध्याय" integer: "III" } }
+        e.g. कक्षा दस -> tokens { roman { key: "कक्षा" integer: "X" } }
+
+    Args:
+        cardinal: CardinalFst, used to read spoken numbers.
+    """
+
+    MAX_NUMBER = 3999
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="roman", kind="classify")
+
+        key_words = [label[0] for label in load_labels(get_abs_path("data/roman/key_words.tsv"))]
+        key_words_fst = pynini.union(*[pynini.accep(word) for word in key_words]).optimize()
+
+        roman_to_value = {
+            roman: int(value) for roman, value in load_labels(get_abs_path("data/roman/roman_numerals.tsv"))
+        }
+        value_to_roman = {value: roman for roman, value in roman_to_value.items()}
+
+        not_quote = pynini.closure(pynini.difference(NEMO_SIGMA, pynini.accep('"')), 1)
+        strip_cardinal_tags = pynutil.delete('cardinal { integer: "') + not_quote + pynutil.delete('" }')
+        cardinal_to_devanagari = pynini.compose(cardinal.fst, strip_cardinal_tags).optimize()
+
+        single_digit_to_devanagari = (
+            pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
+            | pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
+        )
+        glyph_to_ascii = pynini.union(
+            *[pynini.cross(glyph, str(value)) for value, glyph in enumerate(DEVANAGARI_DIGIT)]
+        )
+        devanagari_to_ascii = pynini.cdrewrite(glyph_to_ascii, "", "", NEMO_SIGMA)
+        spoken_to_ascii = pynini.compose(
+            cardinal_to_devanagari | single_digit_to_devanagari, devanagari_to_ascii
+        ).optimize()
+
+        ascii_to_roman = pynini.string_map(
+            [(str(value), self._int_to_roman(value, value_to_roman)) for value in range(1, self.MAX_NUMBER + 1)]
+        ).optimize()
+        spoken_to_roman = pynini.compose(spoken_to_ascii, ascii_to_roman).optimize()
+
+        graph = (
+            pynutil.insert("key: \"")
+            + key_words_fst
+            + pynutil.insert("\"")
+            + delete_space
+            + insert_space
+            + pynutil.insert("integer: \"")
+            + spoken_to_roman
+            + pynutil.insert("\"")
+        )
+        self.fst = self.add_tokens(graph).optimize()
+
+    def _int_to_roman(self, number, value_to_roman):
+        roman = ""
+        for value in sorted(value_to_roman, reverse=True):
+            while number >= value:
+                roman += value_to_roman[value]
+                number -= value
+        return roman
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py
@@ -33,6 +33,7 @@
 from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.roman import RomanFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
@@ -89,6 +90,8 @@ def __init__(
             money_graph = money.fst
             telephone = TelephoneFst(cardinal)
             telephone_graph = telephone.fst
+            roman = RomanFst(cardinal)
+            roman_graph = roman.fst
             punct_graph = PunctuationFst().fst
             whitelist_graph = WhiteListFst().fst
             word_graph = WordFst().fst
@@ -103,6 +106,7 @@ def __init__(
                 | pynutil.add_weight(measure_graph, 1.1)
                 | pynutil.add_weight(money_graph, 1.1)
                 | pynutil.add_weight(telephone_graph, 1.1)
+                | pynutil.add_weight(roman_graph, 1.1)
                 | pynutil.add_weight(word_graph, 100)
                 | pynutil.add_weight(whitelist_graph, 1.01)
             )

diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/roman.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/roman.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
+    NEMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+    insert_space,
+)
+
+
+class RomanFst(GraphFst):
+    """
+    Finite state transducer for verbalizing Roman numerals
+        e.g. tokens { roman { key: "अध्याय" integer: "III" } } -> अध्याय III
+    """
+
+    def __init__(self):
+        super().__init__(name="roman", kind="verbalize")
+        key = pynutil.delete("key: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        graph = key + delete_space + insert_space + integer
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py
@@ -21,6 +21,7 @@
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.hi.verbalizers.roman import RomanFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
@@ -48,6 +49,7 @@ def __init__(self):
         measure_graph = MeasureFst(cardinal, decimal).fst
         money_graph = MoneyFst(cardinal, decimal).fst
         telephone_graph = TelephoneFst(cardinal).fst
+        roman_graph = RomanFst().fst
         word_graph = WordFst().fst
         whitelist_graph = WhiteListFst().fst
 
@@ -63,5 +65,6 @@ def __init__(self):
             | measure_graph
             | money_graph
             | telephone_graph
+            | roman_graph
         )
         self.fst = graph
diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_roman.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_roman.txt
@@ -0,0 +1,11 @@
+अध्याय एक~अध्याय I
+अध्याय तीन~अध्याय III
+अध्याय चार~अध्याय IV
+अध्याय नौ~अध्याय IX
+खंड पाँच~खंड V
+खण्ड सात~खण्ड VII
+कक्षा दस~कक्षा X
+कक्षा बारह~कक्षा XII
+अध्याय बीस~अध्याय XX
+अध्याय चालीस~अध्याय XL
+अध्याय निन्यानवे~अध्याय XCIX
diff --git a/tests/nemo_text_processing/hi/test_roman.py b/tests/nemo_text_processing/hi/test_roman.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestRoman:
+    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_roman.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred.strip() == expected.strip()
diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh
@@ -83,6 +83,11 @@ testITNWhiteList() {
   runtest $input
 }
 
+testITNRoman() {
+  input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_roman.txt  # Roman numeral cases, one "spoken~expected" pair per line
+  runtest $input
+}
+
 
 # Load shUnit2
 . $PROJECT_DIR/../shunit2/shunit2
-Original file line number
+Diff line change
@@ -0,0 +1,13 @@
+    I	1
+    V	5
+    X	10
+    L	50
+    C	100
+    D	500
+    M	1000
+    IV	4
+    IX	9
+    XL	40
+    XC	90
+    CD	400
+    CM	900