1use std::{collections::HashMap, fmt, str, sync::Arc};
2
3use thiserror::Error;
4use unicode_segmentation::{Graphemes, UnicodeSegmentation};
5
/// Owned text of a single stored grapheme cluster.
///
/// A boxed `str` is used because the text is never resized once stored.
type Grapheme = Box<str>;
7
8#[derive(Debug, Error)]
9#[error("Input {input} is not a grapheme")]
10pub struct NotGrapheme {
11 pub input: String,
12}
13
14#[derive(Debug, Error)]
15#[error("Grapheme id {id} is unknwon")]
16pub struct UnknownId {
17 pub id: Id,
18}
19
/// Compact identifier of a grapheme.
///
/// Values up to `char::MAX` stand for the Unicode scalar value of a
/// single-`char` grapheme (see the `From<char>` impl). Values above that are
/// produced by [`Id::from_index`] and encode a position in a registry's
/// storage, shifted past the `char` range so the two kinds never collide.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Id {
    bits: u64,
}

impl Id {
    /// Builds the identifier for the registry entry stored at `index`.
    ///
    /// # Panics
    ///
    /// Panics if `index`, once shifted past the `char` range, does not fit
    /// in a `u64`.
    fn from_index(index: usize) -> Self {
        // First value that can no longer be mistaken for a `char`.
        let first_registry_bits = u64::from(char::MAX) + 1;
        let shifted = match u64::try_from(index) {
            Ok(offset) => offset.checked_add(first_registry_bits),
            Err(_) => None,
        };
        let bits = shifted.expect("index could not be so large");
        Self { bits }
    }
}
35
36impl From<char> for Id {
37 fn from(value: char) -> Self {
38 Self { bits: u64::from(value) }
39 }
40}
41
42impl fmt::Display for Id {
43 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
44 write!(f, "{}", self.bits)
45 }
46}
47
48impl PartialEq<char> for Id {
49 fn eq(&self, other: &char) -> bool {
50 *self == Self::from(*other)
51 }
52}
53
54impl PartialEq<Id> for char {
55 fn eq(&self, other: &Id) -> bool {
56 Id::from(*self) == *other
57 }
58}
59
60#[derive(Debug)]
61struct RegistryInner {
62 index_to_string: Vec<Grapheme>,
63 string_to_id: HashMap<Grapheme, Id>,
64}
65
66impl RegistryInner {
67 pub fn new() -> Self {
68 Self { index_to_string: Vec::new(), string_to_id: HashMap::new() }
69 }
70
71 pub fn index_to_string(&self, index: usize) -> Option<&str> {
72 self.index_to_string.get(index).map(AsRef::as_ref)
73 }
74
75 pub fn get_or_register(&mut self, grapheme: &str) -> Id {
76 match self.string_to_id.get(grapheme) {
77 Some(id) => *id,
78 None => self.register(grapheme),
79 }
80 }
81
82 pub fn register(&mut self, grapheme: &str) -> Id {
83 let index = self.index_to_string.len();
84 let id = Id::from_index(index);
85 self.index_to_string.push(grapheme.into());
86 self.string_to_id.insert(grapheme.into(), id);
87 id
88 }
89}
90
91#[derive(Debug, Clone)]
92pub struct Registry {
93 inner: Arc<std::sync::Mutex<RegistryInner>>,
94}
95
96impl Registry {
97 pub fn new() -> Self {
98 Self { inner: Arc::new(std::sync::Mutex::new(RegistryInner::new())) }
99 }
100
101 pub fn get_or_register_many<'r, 'g>(
102 &'r self,
103 graphemes: &'g str,
104 ) -> GetOrRegisterMany<'r, 'g> {
105 GetOrRegisterMany {
106 registry: self,
107 graphemes: graphemes.graphemes(true),
108 }
109 }
110
111 pub fn len_of(&self, graphemes: &str) -> usize {
112 graphemes.graphemes(true).count()
113 }
114
115 pub fn get_or_register(&self, grapheme: &str) -> Result<Id, NotGrapheme> {
116 let mut iter = grapheme.graphemes(true);
117 if iter.next().is_none() {
118 Err(NotGrapheme { input: grapheme.into() })?;
119 }
120 if iter.next().is_some() {
121 Err(NotGrapheme { input: grapheme.into() })?;
122 }
123 Ok(self.get_or_register_unchecked(grapheme))
124 }
125
126 pub fn lookup<F, T>(&self, id: Id, scope: F) -> T
127 where
128 F: FnOnce(Result<GraphemeChars<'_>, UnknownId>) -> T,
129 {
130 let max_char = u64::from(char::MAX);
131 if let Some(bits) = id.bits.checked_sub(max_char + 1) {
132 let index = usize::try_from(bits)
133 .expect("id bits should have been constructed from index");
134 let inner = self.inner.lock().expect("poisoned lock");
135 let result = inner
136 .index_to_string(index)
137 .map(GraphemeChars::multiple)
138 .ok_or(UnknownId { id });
139 scope(result)
140 } else {
141 let code = u32::try_from(id.bits)
142 .ok()
143 .and_then(char::from_u32)
144 .expect("already checked for char range");
145 let result = Ok(GraphemeChars::single(code));
146 scope(result)
147 }
148 }
149
150 fn get_or_register_unchecked(&self, grapheme: &str) -> Id {
151 let mut chars = grapheme.chars();
152 if let Some(ch) = chars.next() {
153 if chars.next().is_none() {
154 return Id::from(ch);
155 }
156 }
157
158 let mut inner = self.inner.lock().expect("poisoned lock");
159 inner.get_or_register(grapheme)
160 }
161}
162
163#[derive(Debug)]
164pub struct GetOrRegisterMany<'r, 'g> {
165 registry: &'r Registry,
166 graphemes: Graphemes<'g>,
167}
168
169impl<'r, 'g> Iterator for GetOrRegisterMany<'r, 'g> {
170 type Item = Id;
171
172 fn next(&mut self) -> Option<Self::Item> {
173 let grapheme = self.graphemes.next()?;
174 Some(self.registry.get_or_register_unchecked(grapheme))
175 }
176}
177
/// Iterator over the `char`s that make up one grapheme.
#[derive(Debug, Clone)]
pub struct GraphemeChars<'r> {
    inner: GraphemeCharsInner<'r>,
}

impl<'r> GraphemeChars<'r> {
    /// Iterator that yields `ch` once and is then exhausted.
    fn single(ch: char) -> Self {
        let inner = GraphemeCharsInner::Single(Some(ch));
        Self { inner }
    }

    /// Iterator over every `char` of `content`.
    fn multiple(content: &'r str) -> Self {
        let inner = GraphemeCharsInner::Multiple(content.chars());
        Self { inner }
    }
}

impl<'r> Iterator for GraphemeChars<'r> {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        match self.inner {
            // Taking the value leaves `None` behind, ending the iteration.
            GraphemeCharsInner::Single(ref mut pending) => pending.take(),
            GraphemeCharsInner::Multiple(ref mut chars) => chars.next(),
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        match &self.inner {
            GraphemeCharsInner::Single(pending) => {
                // Exactly one item while the character is still pending.
                let remaining = usize::from(pending.is_some());
                (remaining, Some(remaining))
            }
            GraphemeCharsInner::Multiple(chars) => chars.size_hint(),
        }
    }
}

/// Backing state of [`GraphemeChars`].
#[derive(Debug, Clone)]
enum GraphemeCharsInner<'r> {
    /// A lone character, `None` once it has been yielded.
    Single(Option<char>),
    /// Characters borrowed from stored grapheme text.
    Multiple(str::Chars<'r>),
}
217
#[cfg(test)]
mod test {
    use super::{Id, Registry};

    /// Resolves `id` through `registry` and collects its characters,
    /// panicking if the id is unknown.
    fn resolve(registry: &Registry, id: Id) -> String {
        registry.lookup(id, |result| result.unwrap().collect())
    }

    /// Registers `grapheme` in a fresh registry and checks that two
    /// consecutive lookups both give the original text back.
    fn assert_round_trip(grapheme: &str) {
        let registry = Registry::new();
        let id = registry.get_or_register(grapheme).unwrap();
        for _ in 0..2 {
            let actual = resolve(&registry, id);
            assert_eq!(grapheme, actual);
        }
    }

    #[test]
    fn id_from_char_is_char_ascii() {
        let expected = 'a' as u64;
        assert_eq!(expected, Id::from('a').bits);
    }

    #[test]
    fn id_from_char_is_char_unicode() {
        let expected = 'á' as u64;
        assert_eq!(expected, Id::from('á').bits);
    }

    #[test]
    fn register_single_char_grapheme_ascii() {
        assert_round_trip("a");
    }

    #[test]
    fn register_single_char_grapheme_unicode() {
        assert_round_trip("á");
    }

    #[test]
    fn register_single_grapheme_cluster() {
        assert_round_trip("b̥");
    }

    #[test]
    fn register_many() {
        let registry = Registry::new();
        let ids: Vec<_> = registry.get_or_register_many("ab̥á").collect();

        // Every id, in order, resolves to the cluster it was created from.
        let actual: Vec<String> =
            ids.iter().map(|&id| resolve(&registry, id)).collect();
        let expected = ["a", "b̥", "á"].map(ToOwned::to_owned);
        assert_eq!(&expected[..], &actual[..]);

        // Looking the ids up a second time, one by one, gives the same text.
        assert_eq!("a", resolve(&registry, ids[0]));
        assert_eq!("b̥", resolve(&registry, ids[1]));
        assert_eq!("á", resolve(&registry, ids[2]));
    }
}