litrs/
escape.rs

Help
1use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::{hex_digit_value, check_suffix}};
2
3
4/// Must start with `\`
5pub(crate) fn unescape<E: Escapee>(input: &str, offset: usize) -> Result<(E, usize), ParseError> {
6    let first = input.as_bytes().get(1)
7        .ok_or(perr(offset, UnterminatedEscape))?;
8    let out = match first {
9        // Quote escapes
10        b'\'' => (E::from_byte(b'\''), 2),
11        b'"' => (E::from_byte(b'"'), 2),
12
13        // Ascii escapes
14        b'n' => (E::from_byte(b'\n'), 2),
15        b'r' => (E::from_byte(b'\r'), 2),
16        b't' => (E::from_byte(b'\t'), 2),
17        b'\\' => (E::from_byte(b'\\'), 2),
18        b'0' => (E::from_byte(b'\0'), 2),
19        b'x' => {
20            let hex_string = input.get(2..4)
21                .ok_or(perr(offset..offset + input.len(), UnterminatedEscape))?
22                .as_bytes();
23            let first = hex_digit_value(hex_string[0])
24                .ok_or(perr(offset..offset + 4, InvalidXEscape))?;
25            let second = hex_digit_value(hex_string[1])
26                .ok_or(perr(offset..offset + 4, InvalidXEscape))?;
27            let value = second + 16 * first;
28
29            if E::SUPPORTS_UNICODE && value > 0x7F {
30                return Err(perr(offset..offset + 4, NonAsciiXEscape));
31            }
32
33            (E::from_byte(value), 4)
34        },
35
36        // Unicode escape
37        b'u' => {
38            if !E::SUPPORTS_UNICODE {
39                return Err(perr(offset..offset + 2, UnicodeEscapeInByteLiteral));
40            }
41
42            if input.as_bytes().get(2) != Some(&b'{') {
43                return Err(perr(offset..offset + 2, UnicodeEscapeWithoutBrace));
44            }
45
46            let closing_pos = input.bytes().position(|b| b == b'}')
47                .ok_or(perr(offset..offset + input.len(), UnterminatedUnicodeEscape))?;
48
49            let inner = &input[3..closing_pos];
50            if inner.as_bytes().first() == Some(&b'_') {
51                return Err(perr(4, InvalidStartOfUnicodeEscape));
52            }
53
54            let mut v: u32 = 0;
55            let mut digit_count = 0;
56            for (i, b) in inner.bytes().enumerate() {
57                if b == b'_'{
58                    continue;
59                }
60
61                let digit = hex_digit_value(b)
62                    .ok_or(perr(offset + 3 + i, NonHexDigitInUnicodeEscape))?;
63
64                if digit_count == 6 {
65                    return Err(perr(offset + 3 + i, TooManyDigitInUnicodeEscape));
66                }
67                digit_count += 1;
68                v = 16 * v + digit as u32;
69            }
70
71            let c = std::char::from_u32(v)
72                .ok_or(perr(offset..closing_pos + 1, InvalidUnicodeEscapeChar))?;
73
74            (E::from_char(c), closing_pos + 1)
75        }
76
77        _ => return Err(perr(offset..offset + 2, UnknownEscape)),
78    };
79
80    Ok(out)
81}
82
/// A value that an escape sequence can decode into.
///
/// Implemented for `u8` (byte literals) and `char` (unicode-capable literals).
/// Every implementor must be convertible into a `char`.
pub(crate) trait Escapee: Into<char> {
    /// `true` if literals of this kind may contain non-ASCII content and
    /// `\u{…}` escapes; `false` for byte-oriented literals.
    const SUPPORTS_UNICODE: bool;

    /// Builds the value from a single byte, as produced by quote, ASCII and
    /// `\x` escapes.
    fn from_byte(b: u8) -> Self;

    /// Builds the value from a `char`, as produced by a `\u{…}` escape.
    fn from_char(c: char) -> Self;
}
88
89impl Escapee for u8 {
90    const SUPPORTS_UNICODE: bool = false;
91    fn from_byte(b: u8) -> Self {
92        b
93    }
94    fn from_char(_: char) -> Self {
95        panic!("bug: `<u8 as Escapee>::from_char` was called");
96    }
97}
98
99impl Escapee for char {
100    const SUPPORTS_UNICODE: bool = true;
101    fn from_byte(b: u8) -> Self {
102        b.into()
103    }
104    fn from_char(c: char) -> Self {
105        c
106    }
107}
108
/// Checks whether the byte is skipped after a string continue start
/// (unescaped backslash followed by `\n`).
///
/// The skipped bytes are space, tab, line feed and carriage return.
fn is_string_continue_skipable_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}
114
115/// Unescapes a whole string or byte string.
116#[inline(never)]
117pub(crate) fn unescape_string<E: Escapee>(
118    input: &str,
119    offset: usize,
120) -> Result<(Option<String>, usize), ParseError> {
121    let mut closing_quote_pos = None;
122    let mut i = offset;
123    let mut end_last_escape = offset;
124    let mut value = String::new();
125    while i < input.len() {
126        match input.as_bytes()[i] {
127            // Handle "string continue".
128            b'\\' if input.as_bytes().get(i + 1) == Some(&b'\n') => {
129                value.push_str(&input[end_last_escape..i]);
130
131                // Find the first non-whitespace character.
132                let end_escape = input[i + 2..].bytes()
133                    .position(|b| !is_string_continue_skipable_whitespace(b))
134                    .ok_or(perr(None, UnterminatedString))?;
135
136                i += 2 + end_escape;
137                end_last_escape = i;
138            }
139            b'\\' => {
140                let (c, len) = unescape::<E>(&input[i..input.len() - 1], i)?;
141                value.push_str(&input[end_last_escape..i]);
142                value.push(c.into());
143                i += len;
144                end_last_escape = i;
145            }
146            b'\r' => {
147                if input.as_bytes().get(i + 1) == Some(&b'\n') {
148                    value.push_str(&input[end_last_escape..i]);
149                    value.push('\n');
150                    i += 2;
151                    end_last_escape = i;
152                } else {
153                    return Err(perr(i, IsolatedCr))
154                }
155            }
156            b'"' => {
157                closing_quote_pos = Some(i);
158                break;
159            },
160            b if !E::SUPPORTS_UNICODE && !b.is_ascii()
161                => return Err(perr(i, NonAsciiInByteLiteral)),
162            _ => i += 1,
163        }
164    }
165
166    let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedString))?;
167
168    let start_suffix = closing_quote_pos + 1;
169    let suffix = &input[start_suffix..];
170    check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
171
172    // `value` is only empty if there was no escape in the input string
173    // (with the special case of the input being empty). This means the
174    // string value basically equals the input, so we store `None`.
175    let value = if value.is_empty() {
176        None
177    } else {
178        // There was an escape in the string, so we need to push the
179        // remaining unescaped part of the string still.
180        value.push_str(&input[end_last_escape..closing_quote_pos]);
181        Some(value)
182    };
183
184    Ok((value, start_suffix))
185}
186
187/// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to
188/// just `\n` sequences. Returns an optional new string (if the input contained
189/// any `\r\n`) and the number of hashes used by the literal.
190#[inline(never)]
191pub(crate) fn scan_raw_string<E: Escapee>(
192    input: &str,
193    offset: usize,
194) -> Result<(Option<String>, u32, usize), ParseError> {
195    // Raw string literal
196    let num_hashes = input[offset..].bytes().position(|b| b != b'#')
197        .ok_or(perr(None, InvalidLiteral))?;
198
199    if input.as_bytes().get(offset + num_hashes) != Some(&b'"') {
200        return Err(perr(None, InvalidLiteral));
201    }
202    let start_inner = offset + num_hashes + 1;
203    let hashes = &input[offset..num_hashes + offset];
204
205    let mut closing_quote_pos = None;
206    let mut i = start_inner;
207    let mut end_last_escape = start_inner;
208    let mut value = String::new();
209    while i < input.len() {
210        let b = input.as_bytes()[i];
211        if b == b'"' && input[i + 1..].starts_with(hashes) {
212            closing_quote_pos = Some(i);
213            break;
214        }
215
216        if b == b'\r' {
217            // Convert `\r\n` into `\n`. This is currently not well documented
218            // in the Rust reference, but is done even for raw strings. That's
219            // because rustc simply converts all line endings when reading
220            // source files.
221            if input.as_bytes().get(i + 1) == Some(&b'\n') {
222                value.push_str(&input[end_last_escape..i]);
223                value.push('\n');
224                i += 2;
225                end_last_escape = i;
226                continue;
227            } else if E::SUPPORTS_UNICODE {
228                // If no \n follows the \r and we are scanning a raw string
229                // (not raw byte string), we error.
230                return Err(perr(i, IsolatedCr))
231            }
232        }
233
234        if !E::SUPPORTS_UNICODE {
235            if !b.is_ascii() {
236                return Err(perr(i, NonAsciiInByteLiteral));
237            }
238        }
239
240        i += 1;
241    }
242
243    let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedRawString))?;
244
245    let start_suffix = closing_quote_pos + num_hashes + 1;
246    let suffix = &input[start_suffix..];
247    check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
248
249    // `value` is only empty if there was no \r\n in the input string (with the
250    // special case of the input being empty). This means the string value
251    // equals the input, so we store `None`.
252    let value = if value.is_empty() {
253        None
254    } else {
255        // There was an \r\n in the string, so we need to push the remaining
256        // unescaped part of the string still.
257        value.push_str(&input[end_last_escape..closing_quote_pos]);
258        Some(value)
259    };
260
261    Ok((value, num_hashes as u32, start_suffix))
262}
litrs/escape.rs

litrs/
escape.rs