icu_normalizer/
uts46.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Bundles the part of UTS 46 that makes sense to implement as a
6//! normalization.
7//!
8//! This is meant to be used as a building block of a UTS 46
9//! implementation, such as the `idna` crate.
10
11use crate::ComposingNormalizer;
12use crate::ComposingNormalizerBorrowed;
13use crate::NormalizerNfcV1;
14use crate::NormalizerNfdTablesV1;
15use crate::NormalizerNfkdTablesV1;
16use crate::NormalizerUts46DataV1;
17use icu_provider::DataError;
18use icu_provider::DataProvider;
19
20// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
21// having a `Uts46Mapper` serves two purposes:
22//
23// 1. Denying public access to parts of the `ComposingNormalizer` API
24//    that don't work when the data contains markers for ignorables.
25// 2. Providing a place where additional iterator pre-processing or
26//    post-processing can take place if needed in the future. (When
27//    writing this, it looked like such processing was needed but
28//    now isn't needed after all.)
29
30/// A borrowed version of a mapper that knows how to performs the
31/// subsets of UTS 46 processing documented on the methods.
32#[derive(Debug)]
33pub struct Uts46MapperBorrowed<'a> {
34    normalizer: ComposingNormalizerBorrowed<'a>,
35}
36
#[cfg(feature = "compiled_data")]
impl Default for Uts46MapperBorrowed<'static> {
    /// Same as [`Uts46MapperBorrowed::new`]: the mapper is built from
    /// compiled data.
    fn default() -> Self {
        Uts46MapperBorrowed::new()
    }
}
43
44impl Uts46MapperBorrowed<'static> {
45    /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`].
46    ///
47    /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some
48    /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`].
49    pub const fn static_to_owned(self) -> Uts46Mapper {
50        Uts46Mapper {
51            normalizer: self.normalizer.static_to_owned(),
52        }
53    }
54
55    /// Construct with compiled data.
56    #[cfg(feature = "compiled_data")]
57    pub const fn new() -> Self {
58        Uts46MapperBorrowed {
59            normalizer: ComposingNormalizerBorrowed::new_uts46(),
60        }
61    }
62}
63
64impl Uts46MapperBorrowed<'_> {
65    /// Returns an iterator adaptor that turns an `Iterator` over `char`
66    /// into an iterator yielding a `char` sequence that gets the following
67    /// operations from the "Map" and "Normalize" steps of the "Processing"
68    /// section of UTS 46 lazily applied to it:
69    ///
70    /// 1. The _ignored_ characters are ignored.
71    /// 2. The _mapped_ characters are mapped.
72    /// 3. The _disallowed_ characters are replaced with U+FFFD,
73    ///    which itself is a disallowed character.
74    /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
75    ///    as appropriate.
76    /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
77    /// 6. The _disallowed_STD3_mapped_ characters are treated as
78    ///    _mapped_.
79    /// 7. The result is normalized to NFC.
80    ///
81    /// Notably:
82    ///
83    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
84    ///   post-processing step.
85    /// * Transitional processing is not performed. Transitional mapping
86    ///   would be a pre-processing step, but transitional processing is
87    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
88    pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>(
89        &'delegate self,
90        iter: I,
91    ) -> impl Iterator<Item = char> + 'delegate {
92        self.normalizer
93            .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored)
94    }
95
96    /// Returns an iterator adaptor that turns an `Iterator` over `char`
97    /// into an iterator yielding a `char` sequence that gets the following
98    /// operations from the NFC check and statucs steps of the "Validity
99    /// Criteria" section of UTS 46 lazily applied to it:
100    ///
101    /// 1. The _ignored_ characters are treated as _disallowed_.
102    /// 2. The _mapped_ characters are mapped.
103    /// 3. The _disallowed_ characters are replaced with U+FFFD,
104    ///    which itself is a disallowed character.
105    /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
106    ///    as appropriate.
107    /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
108    /// 6. The _disallowed_STD3_mapped_ characters are treated as
109    ///    _mapped_.
110    /// 7. The result is normalized to NFC.
111    ///
112    /// Notably:
113    ///
114    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
115    ///   post-processing step.
116    /// * Transitional processing is not performed. Transitional mapping
117    ///   would be a pre-processing step, but transitional processing is
118    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
119    /// * The output needs to be compared with input to see if anything
120    ///   changed. This check catches failures to adhere to the normalization
121    ///   and status requirements. In particular, this comparison results
122    ///   in _mapped_ characters resulting in error like "Validity Criteria"
123    ///   requires.
124    pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>(
125        &'delegate self,
126        iter: I,
127    ) -> impl Iterator<Item = char> + 'delegate {
128        self.normalizer
129            .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter)
130    }
131}
132
133/// A mapper that knows how to performs the subsets of UTS 46 processing
134/// documented on the methods.
135#[derive(Debug)]
136pub struct Uts46Mapper {
137    normalizer: ComposingNormalizer,
138}
139
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    /// Builds the compiled-data borrowed mapper and converts it into the
    /// owned type.
    fn default() -> Self {
        Uts46MapperBorrowed::new().static_to_owned()
    }
}
146
147impl Uts46Mapper {
148    /// Constructs a borrowed version of this type for more efficient querying.
149    pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> {
150        Uts46MapperBorrowed {
151            normalizer: self.normalizer.as_borrowed(),
152        }
153    }
154
155    /// Construct with compiled data.
156    #[cfg(feature = "compiled_data")]
157    #[allow(clippy::new_ret_no_self)]
158    pub const fn new() -> Uts46MapperBorrowed<'static> {
159        Uts46MapperBorrowed::new()
160    }
161
162    /// Construct with provider.
163    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
164    pub fn try_new<D>(provider: &D) -> Result<Self, DataError>
165    where
166        D: DataProvider<NormalizerUts46DataV1>
167            + DataProvider<NormalizerNfdTablesV1>
168            + DataProvider<NormalizerNfkdTablesV1>
169            // UTS 46 tables merged into NormalizerNfkdTablesV1
170            + DataProvider<NormalizerNfcV1>
171            + ?Sized,
172    {
173        let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?;
174
175        Ok(Uts46Mapper { normalizer })
176    }
177}