1use std::{collections::HashMap, fmt, str, sync::Arc};
2
3use thiserror::Error;
4use unicode_segmentation::{Graphemes, UnicodeSegmentation};
5
/// Owned text of one grapheme cluster as stored by the registry tables.
type Grapheme = Box<str>;
7
/// Error reported when a string is not exactly one grapheme cluster.
///
/// `Registry::get_or_register` returns it when the input segments into zero
/// extended grapheme clusters or into more than one.
#[derive(Debug, Error)]
#[error("Input {input} is not a grapheme")]
pub struct NotGrapheme {
    /// Copy of the rejected input.
    pub input: String,
}
13
14#[derive(Debug, Error)]
15#[error("Grapheme id {id} is unknwon")]
16pub struct UnknownId {
17 pub id: Id,
18}
19
/// Compact, copyable identifier of a grapheme.
///
/// Values up to `char::MAX` are the code point of a single-`char` grapheme;
/// larger values are a registry table index shifted past the `char` range.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Id {
    bits: u64,
}

impl Id {
    /// Builds the id for the table slot `index`.
    ///
    /// # Panics
    ///
    /// Panics if `index` shifted past the `char` range does not fit in `u64`.
    fn from_index(index: usize) -> Self {
        // First value that can never collide with an id made from a `char`.
        let first_registered = u64::from(char::MAX) + 1;
        let shifted = match u64::try_from(index) {
            Ok(offset) => offset.checked_add(first_registered),
            Err(_) => None,
        };
        Self { bits: shifted.expect("index could not be so large") }
    }
}
35
36impl From<char> for Id {
37 fn from(value: char) -> Self {
38 Self { bits: u64::from(value) }
39 }
40}
41
42impl fmt::Display for Id {
43 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
44 write!(f, "{}", self.bits)
45 }
46}
47
48impl PartialEq<char> for Id {
49 fn eq(&self, other: &char) -> bool {
50 *self == Self::from(*other)
51 }
52}
53
54impl PartialEq<Id> for char {
55 fn eq(&self, other: &Id) -> bool {
56 Id::from(*self) == *other
57 }
58}
59
60#[derive(Debug)]
61struct RegistryInner {
62 index_to_string: Vec<Grapheme>,
63 string_to_id: HashMap<Grapheme, Id>,
64}
65
66impl RegistryInner {
67 pub fn new() -> Self {
68 Self { index_to_string: Vec::new(), string_to_id: HashMap::new() }
69 }
70
71 pub fn index_to_string(&self, index: usize) -> Option<&str> {
72 self.index_to_string.get(index).map(AsRef::as_ref)
73 }
74
75 pub fn get_or_register(&mut self, grapheme: &str) -> Id {
76 match self.string_to_id.get(grapheme) {
77 Some(id) => *id,
78 None => self.register(grapheme),
79 }
80 }
81
82 pub fn register(&mut self, grapheme: &str) -> Id {
83 let index = self.index_to_string.len();
84 let id = Id::from_index(index);
85 self.index_to_string.push(grapheme.into());
86 self.string_to_id.insert(grapheme.into(), id);
87 id
88 }
89}
90
91#[derive(Debug, Clone)]
92pub struct Registry {
93 inner: Arc<std::sync::Mutex<RegistryInner>>,
94}
95
96impl Registry {
97 pub fn new() -> Self {
98 Self { inner: Arc::new(std::sync::Mutex::new(RegistryInner::new())) }
99 }
100
101 pub fn get_or_register_many<'r, 'g>(
102 &'r self,
103 graphemes: &'g str,
104 ) -> GetOrRegisterMany<'r, 'g> {
105 GetOrRegisterMany {
106 registry: self,
107 graphemes: graphemes.graphemes(true),
108 }
109 }
110
111 pub fn get_or_register(&self, grapheme: &str) -> Result<Id, NotGrapheme> {
112 let mut iter = grapheme.graphemes(true);
113 if iter.next().is_none() {
114 Err(NotGrapheme { input: grapheme.into() })?;
115 }
116 if iter.next().is_some() {
117 Err(NotGrapheme { input: grapheme.into() })?;
118 }
119 Ok(self.get_or_register_unchecked(grapheme))
120 }
121
122 pub fn lookup<F, T>(&self, id: Id, scope: F) -> T
123 where
124 F: FnOnce(Result<GraphemeChars<'_>, UnknownId>) -> T,
125 {
126 let max_char = u64::from(char::MAX);
127 if let Some(bits) = id.bits.checked_sub(max_char + 1) {
128 let index = usize::try_from(bits)
129 .expect("id bits should have been constructed from index");
130 let inner = self.inner.lock().expect("poisoned lock");
131 let result = inner
132 .index_to_string(index)
133 .map(GraphemeChars::multiple)
134 .ok_or(UnknownId { id });
135 scope(result)
136 } else {
137 let code = u32::try_from(id.bits)
138 .ok()
139 .and_then(char::from_u32)
140 .expect("already checked for char range");
141 let result = Ok(GraphemeChars::single(code));
142 scope(result)
143 }
144 }
145
146 fn get_or_register_unchecked(&self, grapheme: &str) -> Id {
147 let mut chars = grapheme.chars();
148 if let Some(ch) = chars.next() {
149 if chars.next().is_none() {
150 return Id::from(ch);
151 }
152 }
153
154 let mut inner = self.inner.lock().expect("poisoned lock");
155 inner.get_or_register(grapheme)
156 }
157}
158
159#[derive(Debug)]
160pub struct GetOrRegisterMany<'r, 'g> {
161 registry: &'r Registry,
162 graphemes: Graphemes<'g>,
163}
164
165impl<'r, 'g> Iterator for GetOrRegisterMany<'r, 'g> {
166 type Item = Id;
167
168 fn next(&mut self) -> Option<Self::Item> {
169 let grapheme = self.graphemes.next()?;
170 Some(self.registry.get_or_register_unchecked(grapheme))
171 }
172}
173
/// Iterator over the `char`s that make up one grapheme.
#[derive(Debug, Clone)]
pub struct GraphemeChars<'r> {
    inner: GraphemeCharsInner<'r>,
}

impl<'r> GraphemeChars<'r> {
    /// Iterator that yields `ch` once and then ends.
    fn single(ch: char) -> Self {
        let inner = GraphemeCharsInner::Single(Some(ch));
        Self { inner }
    }

    /// Iterator over every `char` of `content`.
    fn multiple(content: &'r str) -> Self {
        let inner = GraphemeCharsInner::Multiple(content.chars());
        Self { inner }
    }
}

impl<'r> Iterator for GraphemeChars<'r> {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        match self.inner {
            // `take` leaves `None` behind, so the char is produced only once.
            GraphemeCharsInner::Single(ref mut pending) => pending.take(),
            GraphemeCharsInner::Multiple(ref mut rest) => rest.next(),
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        match self.inner {
            GraphemeCharsInner::Single(ref pending) => {
                // Exactly one item while the char is still pending, else zero.
                let remaining = usize::from(pending.is_some());
                (remaining, Some(remaining))
            }
            GraphemeCharsInner::Multiple(ref rest) => rest.size_hint(),
        }
    }
}

/// Backing state of [`GraphemeChars`].
#[derive(Debug, Clone)]
enum GraphemeCharsInner<'r> {
    /// A lone `char`, or `None` once it has been yielded.
    Single(Option<char>),
    /// The remaining `char`s of a stored multi-`char` grapheme.
    Multiple(str::Chars<'r>),
}
213
#[cfg(test)]
mod test {
    use super::{Id, Registry};

    /// Collects the characters `registry` reports for `id` into a `String`,
    /// panicking if the id is unknown.
    fn resolve(registry: &Registry, id: Id) -> String {
        registry.lookup(id, |result| result.unwrap().collect())
    }

    /// Registers `grapheme` in a fresh registry and checks that two
    /// consecutive lookups both give the original text back.
    fn assert_round_trip(grapheme: &str) {
        let registry = Registry::new();
        let id = registry.get_or_register(grapheme).unwrap();
        for _ in 0..2 {
            assert_eq!(grapheme, resolve(&registry, id));
        }
    }

    #[test]
    fn id_from_char_is_char_ascii() {
        assert_eq!('a' as u64, Id::from('a').bits);
    }

    #[test]
    fn id_from_char_is_char_unicode() {
        assert_eq!('á' as u64, Id::from('á').bits);
    }

    #[test]
    fn register_single_char_grapheme_ascii() {
        assert_round_trip("a");
    }

    #[test]
    fn register_single_char_grapheme_unicode() {
        assert_round_trip("á");
    }

    #[test]
    fn register_single_grapheme_cluster() {
        assert_round_trip("b̥");
    }

    #[test]
    fn register_many() {
        let registry = Registry::new();
        let expected = ["a", "b̥", "á"];

        let ids: Vec<Id> = registry.get_or_register_many("ab̥á").collect();
        let actual: Vec<String> =
            ids.iter().map(|&id| resolve(&registry, id)).collect();
        assert_eq!(&expected[..], &actual[..]);

        // Each id must still resolve to the same text on a second lookup.
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(*want, resolve(&registry, ids[index]));
        }
    }
}