66#include < cassert>
77
88#include < algorithm>
9+ #include < iterator>
910#include < random>
1011#include < set>
1112
1213#include < gf2/core/Clock.h>
14+ #include < gf2/core/StringUtils.h>
1315
1416namespace gf {
1517
namespace {

  // Sentinel code point marking word boundaries: training sequences are
  // padded with it at the start and terminated with it at the end, and it
  // is stripped from generated names before they are returned.
  // Spelled as a UTF-32 literal so its type matches the std::u32string data.
  constexpr char32_t WordLimit = U'#';

}
2123
/*
 * NamegenModel
 */
2527
26- NamegenModel::NamegenModel (const std::vector<std::string >& data, std::size_t order, double prior, std::string alphabet)
28+ NamegenModel::NamegenModel (const std::vector<std::u32string >& data, std::size_t order, double prior, std::u32string alphabet)
2729 : m_order(order)
2830 , m_prior(prior)
2931 , m_alphabet(std::move(alphabet))
@@ -33,7 +35,7 @@ namespace gf {
3335 build_chains ();
3436 }
3537
36- std::optional<char > NamegenModel::generate (const std::string & context, Random& random) const
38+ std::optional<char32_t > NamegenModel::generate (const std::u32string & context, Random& random) const
3739 {
3840 assert (context.size () == m_order);
3941 auto iterator = m_chains.find (context);
@@ -50,21 +52,21 @@ namespace gf {
5052 return m_alphabet[index];
5153 }
5254
53- void NamegenModel::retrain (const std::vector<std::string >& data)
55+ void NamegenModel::retrain (const std::vector<std::u32string >& data)
5456 {
5557 train (data);
5658 build_chains ();
5759 }
5860
59- void NamegenModel::train (const std::vector<std::string >& data)
61+ void NamegenModel::train (const std::vector<std::u32string >& data)
6062 {
61- for (const std::string & item : data) {
62- std::string d = std::string (m_order, WordLimit) + item + WordLimit;
63+ for (const std::u32string & item : data) {
64+ const std::u32string sequence = std::u32string (m_order, WordLimit) + item + WordLimit;
6365
64- for (std::size_t i = 0 ; i < d .size () - m_order; ++i) {
65- const std::string key = d .substr (i, m_order);
66- assert (i + m_order < d .size ());
67- m_observations[key].push_back (d [i + m_order]);
66+ for (std::size_t i = 0 ; i < sequence .size () - m_order; ++i) {
67+ const std::u32string key = sequence .substr (i, m_order);
68+ assert (i + m_order < sequence .size ());
69+ m_observations[key].push_back (sequence [i + m_order]);
6870 }
6971 }
7072 }
@@ -76,7 +78,7 @@ namespace gf {
7678 for (auto & [ context, observation ] : m_observations) {
7779 std::vector<double >& values = m_chains[context];
7880
79- for (char prediction : m_alphabet) {
81+ for (char32_t prediction : m_alphabet) {
8082 values.push_back (m_prior + static_cast <double >(std::count (observation.begin (), observation.end (), prediction)));
8183 }
8284 }
@@ -86,20 +88,20 @@ namespace gf {
8688 * NamegenGenerator
8789 */
8890
89- NamegenGenerator::NamegenGenerator (const std::vector<std::string >& data, std::size_t order, double prior, bool backoff)
91+ NamegenGenerator::NamegenGenerator (const std::vector<std::u32string >& data, std::size_t order, double prior, bool backoff)
9092 : m_order(order)
9193 , m_prior(prior)
9294 , m_backoff(backoff)
9395 {
94- std::set<char > letters;
96+ std::set<char32_t > letters;
9597
96- for (const std::string & item : data) {
97- for (const char c : item) {
98+ for (const std::u32string & item : data) {
99+ for (const char32_t c : item) {
98100 letters.insert (c);
99101 }
100102 }
101103
102- std::string alphabet (letters.begin (), letters.end ());
104+ std::u32string alphabet (letters.begin (), letters.end ());
103105 alphabet.push_back (WordLimit);
104106
105107 if (backoff) {
@@ -111,9 +113,9 @@ namespace gf {
111113 }
112114 }
113115
114- std::string NamegenGenerator::generate (Random& random) const
116+ std::u32string NamegenGenerator::generate (Random& random) const
115117 {
116- std::string word (m_order, WordLimit);
118+ std::u32string word (m_order, WordLimit);
117119
118120 auto maybe_letter = compute_letter (word, random);
119121
@@ -125,11 +127,11 @@ namespace gf {
125127 return word;
126128 }
127129
128- std::optional<char > NamegenGenerator::compute_letter (const std::string & word, Random& random) const
130+ std::optional<char32_t > NamegenGenerator::compute_letter (const std::u32string & word, Random& random) const
129131 {
130132 assert (word.size () >= m_order);
131133
132- std::string context = word.substr (word.size () - m_order);
134+ std::u32string context = word.substr (word.size () - m_order);
133135 assert (context.size () == m_order);
134136
135137 for (const NamegenModel& model : m_models) {
@@ -150,13 +152,20 @@ namespace gf {
150152 */
151153
152154 namespace {
155+ std::vector<std::u32string> to_utf32_strings (const std::vector<std::string>& data)
156+ {
157+ std::vector<std::u32string> utf32_data;
158+ std::transform (data.begin (), data.end (), std::back_inserter (utf32_data), to_utf32);
159+ return utf32_data;
160+ }
153161
154- bool satisfy_settings (const std::string & word, const NamegenSettings& settings)
162+ bool satisfy_size_settings (const std::u32string & word, const NamegenSettings& settings)
155163 {
156- if (word.size () < settings.min_length || word.size () > settings.max_length ) {
157- return false ;
158- }
164+ return settings.min_length <= word.size () && word.size () <= settings.max_length ;
165+ }
159166
167+ bool satisfy_settings (const std::string& word, const NamegenSettings& settings)
168+ {
160169 if (settings.starts_with .size () > word.size () || word.substr (0 , settings.starts_with .size ()) != settings.starts_with ) {
161170 return false ;
162171 }
@@ -179,17 +188,23 @@ namespace gf {
179188 }
180189
181190 NamegenManager::NamegenManager (const std::vector<std::string>& data, std::size_t order, double prior, bool backoff)
182- : m_generator(data, order, prior, backoff)
191+ : m_generator(to_utf32_strings( data) , order, prior, backoff)
183192 {
184193 }
185194
186195 std::optional<std::string> NamegenManager::generate_single (Random& random, const NamegenSettings& settings) const
187196 {
188- std::string name = m_generator.generate (random);
197+ std::u32string name = m_generator.generate (random);
189198 name.erase (std::remove (name.begin (), name.end (), WordLimit), name.end ());
190199
191- if (satisfy_settings (name, settings)) {
192- return name;
200+ if (!satisfy_size_settings (name, settings)) {
201+ return std::nullopt ;
202+ }
203+
204+ std::string utf8_name = to_utf8 (name);
205+
206+ if (satisfy_settings (utf8_name, settings)) {
207+ return utf8_name;
193208 }
194209
195210 return std::nullopt ;
0 commit comments