Skip to main content

thedes_tui_core/
grapheme.rs

1use std::{collections::HashMap, fmt, str, sync::Arc};
2
3use thiserror::Error;
4use unicode_segmentation::{Graphemes, UnicodeSegmentation};
5
/// Owned text of a single stored grapheme, kept as a boxed string slice.
type Grapheme = Box<str>;
7
/// Error returned when a string offered as a single grapheme is either empty
/// or made of more than one grapheme cluster.
#[derive(Debug, Error)]
#[error("Input {input} is not a grapheme")]
pub struct NotGrapheme {
    /// The rejected input, copied verbatim.
    pub input: String,
}
13
14#[derive(Debug, Error)]
15#[error("Grapheme id {id} is unknwon")]
16pub struct UnknownId {
17    pub id: Id,
18}
19
/// Compact identifier for a grapheme.
///
/// Values inside the `char` range stand for that very `char`; values above
/// `char::MAX` encode a table index shifted up by `char::MAX + 1`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Id {
    bits: u64,
}

impl Id {
    /// Builds the identifier for the table entry at `index`, placing it
    /// right after the range reserved for plain `char` values.
    ///
    /// # Panics
    ///
    /// Panics if the shifted index cannot be represented as a `u64`.
    fn from_index(index: usize) -> Self {
        let first_free = u64::from(char::MAX) + 1;
        let shifted = match u64::try_from(index) {
            Ok(offset) => offset.checked_add(first_free),
            Err(_) => None,
        };
        Self { bits: shifted.expect("index could not be so large") }
    }
}

impl From<char> for Id {
    /// A single `char` is its own identifier: the id is its code point.
    fn from(value: char) -> Self {
        let bits = u64::from(value);
        Self { bits }
    }
}

impl fmt::Display for Id {
    /// Prints the raw numeric value of the identifier.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!("{}", self.bits))
    }
}

impl PartialEq<char> for Id {
    /// An id equals a `char` when it is exactly the id that `char` converts
    /// to.
    fn eq(&self, other: &char) -> bool {
        self.bits == u64::from(*other)
    }
}

impl PartialEq<Id> for char {
    /// Mirror of `Id == char`, so the comparison works in both directions.
    fn eq(&self, other: &Id) -> bool {
        other.bits == u64::from(*self)
    }
}
59
60#[derive(Debug)]
61struct RegistryInner {
62    index_to_string: Vec<Grapheme>,
63    string_to_id: HashMap<Grapheme, Id>,
64}
65
66impl RegistryInner {
67    pub fn new() -> Self {
68        Self { index_to_string: Vec::new(), string_to_id: HashMap::new() }
69    }
70
71    pub fn index_to_string(&self, index: usize) -> Option<&str> {
72        self.index_to_string.get(index).map(AsRef::as_ref)
73    }
74
75    pub fn get_or_register(&mut self, grapheme: &str) -> Id {
76        match self.string_to_id.get(grapheme) {
77            Some(id) => *id,
78            None => self.register(grapheme),
79        }
80    }
81
82    pub fn register(&mut self, grapheme: &str) -> Id {
83        let index = self.index_to_string.len();
84        let id = Id::from_index(index);
85        self.index_to_string.push(grapheme.into());
86        self.string_to_id.insert(grapheme.into(), id);
87        id
88    }
89}
90
91#[derive(Debug, Clone)]
92pub struct Registry {
93    inner: Arc<std::sync::Mutex<RegistryInner>>,
94}
95
96impl Registry {
97    pub fn new() -> Self {
98        Self { inner: Arc::new(std::sync::Mutex::new(RegistryInner::new())) }
99    }
100
101    pub fn get_or_register_many<'r, 'g>(
102        &'r self,
103        graphemes: &'g str,
104    ) -> GetOrRegisterMany<'r, 'g> {
105        GetOrRegisterMany {
106            registry: self,
107            graphemes: graphemes.graphemes(true),
108        }
109    }
110
111    pub fn len_of(&self, graphemes: &str) -> usize {
112        graphemes.graphemes(true).count()
113    }
114
115    pub fn get_or_register(&self, grapheme: &str) -> Result<Id, NotGrapheme> {
116        let mut iter = grapheme.graphemes(true);
117        if iter.next().is_none() {
118            Err(NotGrapheme { input: grapheme.into() })?;
119        }
120        if iter.next().is_some() {
121            Err(NotGrapheme { input: grapheme.into() })?;
122        }
123        Ok(self.get_or_register_unchecked(grapheme))
124    }
125
126    pub fn lookup<F, T>(&self, id: Id, scope: F) -> T
127    where
128        F: FnOnce(Result<GraphemeChars<'_>, UnknownId>) -> T,
129    {
130        let max_char = u64::from(char::MAX);
131        if let Some(bits) = id.bits.checked_sub(max_char + 1) {
132            let index = usize::try_from(bits)
133                .expect("id bits should have been constructed from index");
134            let inner = self.inner.lock().expect("poisoned lock");
135            let result = inner
136                .index_to_string(index)
137                .map(GraphemeChars::multiple)
138                .ok_or(UnknownId { id });
139            scope(result)
140        } else {
141            let code = u32::try_from(id.bits)
142                .ok()
143                .and_then(char::from_u32)
144                .expect("already checked for char range");
145            let result = Ok(GraphemeChars::single(code));
146            scope(result)
147        }
148    }
149
150    fn get_or_register_unchecked(&self, grapheme: &str) -> Id {
151        let mut chars = grapheme.chars();
152        if let Some(ch) = chars.next() {
153            if chars.next().is_none() {
154                return Id::from(ch);
155            }
156        }
157
158        let mut inner = self.inner.lock().expect("poisoned lock");
159        inner.get_or_register(grapheme)
160    }
161}
162
163#[derive(Debug)]
164pub struct GetOrRegisterMany<'r, 'g> {
165    registry: &'r Registry,
166    graphemes: Graphemes<'g>,
167}
168
169impl<'r, 'g> Iterator for GetOrRegisterMany<'r, 'g> {
170    type Item = Id;
171
172    fn next(&mut self) -> Option<Self::Item> {
173        let grapheme = self.graphemes.next()?;
174        Some(self.registry.get_or_register_unchecked(grapheme))
175    }
176}
177
/// Iterator over the `char`s that make up one grapheme.
#[derive(Debug, Clone)]
pub struct GraphemeChars<'r> {
    inner: GraphemeCharsInner<'r>,
}

impl<'r> GraphemeChars<'r> {
    /// Iterator for a grapheme that consists of exactly one `char`.
    fn single(ch: char) -> Self {
        let inner = GraphemeCharsInner::Single(Some(ch));
        Self { inner }
    }

    /// Iterator over every `char` of the borrowed `content`.
    fn multiple(content: &'r str) -> Self {
        let inner = GraphemeCharsInner::Multiple(content.chars());
        Self { inner }
    }
}

impl<'r> Iterator for GraphemeChars<'r> {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        match self.inner {
            // Taking the value leaves `None` behind, so it is yielded once.
            GraphemeCharsInner::Single(ref mut slot) => slot.take(),
            GraphemeCharsInner::Multiple(ref mut chars) => chars.next(),
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        match &self.inner {
            GraphemeCharsInner::Single(slot) => {
                // Exactly one item while the slot is full, none afterwards.
                let remaining = usize::from(slot.is_some());
                (remaining, Some(remaining))
            },
            GraphemeCharsInner::Multiple(chars) => chars.size_hint(),
        }
    }
}

/// Backing state of [`GraphemeChars`].
#[derive(Debug, Clone)]
enum GraphemeCharsInner<'r> {
    /// One `char` that has not been yielded yet, or `None` once it has.
    Single(Option<char>),
    /// The `char`s of a borrowed string.
    Multiple(str::Chars<'r>),
}
217
#[cfg(test)]
mod test {
    use super::{Id, Registry};

    /// Looks `id` up in `registry` and gathers its chars into a `String`,
    /// panicking if the id is unknown.
    fn resolve(registry: &Registry, id: Id) -> String {
        registry.lookup(id, |result| result.unwrap().collect())
    }

    /// Registers `grapheme` in a fresh registry and checks that two
    /// consecutive lookups of its id both give the original text back.
    fn assert_round_trip(grapheme: &str) {
        let registry = Registry::new();
        let id = registry.get_or_register(grapheme).unwrap();
        for _ in 0..2 {
            assert_eq!(grapheme, resolve(&registry, id));
        }
    }

    #[test]
    fn id_from_char_is_char_ascii() {
        assert_eq!(u64::from('a'), Id::from('a').bits);
    }

    #[test]
    fn id_from_char_is_char_unicode() {
        assert_eq!(u64::from('á'), Id::from('á').bits);
    }

    #[test]
    fn register_single_char_grapheme_ascii() {
        assert_round_trip("a");
    }

    #[test]
    fn register_single_char_grapheme_unicode() {
        assert_round_trip("á");
    }

    #[test]
    fn register_single_grapheme_cluster() {
        assert_round_trip("b̥");
    }

    #[test]
    fn register_many() {
        let registry = Registry::new();
        let ids: Vec<Id> = registry.get_or_register_many("ab̥á").collect();
        let expected = ["a", "b̥", "á"];

        // First pass: resolve every id that came out of the iterator.
        let resolved: Vec<String> =
            ids.iter().map(|&id| resolve(&registry, id)).collect();
        assert_eq!(expected.to_vec(), resolved);

        // Second pass: each id, addressed by position, still resolves to
        // the same text.
        for (index, text) in expected.iter().enumerate() {
            assert_eq!(*text, resolve(&registry, ids[index]));
        }
    }
}