cexpr/
literal.rs

1// (C) Copyright 2016 Jethro G. Beekman
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8//! Parsing C literals from byte slices.
9//!
10//! This will parse a representation of a C literal into a Rust type.
11//!
12//! # characters
13//! Character literals are stored into the `CChar` type, which can hold values
14//! that are not valid Unicode code points. ASCII characters are represented as
15//! `char`, literal bytes with the high bit set are converted into the raw
16//! representation. Escape sequences are supported. If hex and octal escapes
17//! map to an ASCII character, that is used, otherwise, the raw encoding is
18//! used, including for values over 255. Unicode escapes are checked for
19//! validity and mapped to `char`. Character sequences are not supported. Width
20//! prefixes are ignored.
21//!
22//! # strings
23//! Strings are interpreted as byte vectors. Escape sequences are supported. If
24//! hex and octal escapes map onto multi-byte characters, they are truncated to
25//! one 8-bit character. Unicode escapes are converted into their UTF-8
26//! encoding. Width prefixes are ignored.
27//!
28//! # integers
29//! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are
30//! all supported. If the literal value is between `i64::MAX` and `u64::MAX`,
31//! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and
32//! sign suffixes are ignored. Sign prefixes are not supported.
33//!
34//! # real numbers
35//! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are
36//! not supported in the significand. Hexadecimal floating points are not
37//! supported.
38
39use std::char;
40use std::str::{self, FromStr};
41
42use nom::branch::alt;
43use nom::bytes::complete::is_not;
44use nom::bytes::complete::tag;
45use nom::character::complete::{char, one_of};
46use nom::combinator::{complete, map, map_opt, opt, recognize};
47use nom::multi::{fold_many0, many0, many1, many_m_n};
48use nom::sequence::{delimited, pair, preceded, terminated, tuple};
49use nom::*;
50
51use crate::expr::EvalResult;
52use crate::ToCexprResult;
53
/// Representation of a C character
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum CChar {
    /// A character that can be represented as a `char`
    Char(char),
    /// Any other character (8-bit characters, unicode surrogates, etc.)
    Raw(u64),
}

/// Classifies a single byte: ASCII bytes (`0..=0x7f`) become
/// [`CChar::Char`], bytes with the high bit set keep their numeric value in
/// [`CChar::Raw`].
impl From<u8> for CChar {
    fn from(i: u8) -> CChar {
        if i.is_ascii() {
            CChar::Char(i as char)
        } else {
            CChar::Raw(u64::from(i))
        }
    }
}

/// Converts a character into the bytes it contributes to a byte string.
///
/// `CChar::Char` yields the UTF-8 encoding of the character; `CChar::Raw`
/// yields a single byte holding the low 8 bits of the value.
///
/// This is written as `From` rather than a hand-rolled `Into`: the standard
/// blanket impl still provides `CChar: Into<Vec<u8>>`, so `c.into()` keeps
/// working, and `Vec::<u8>::from(c)` becomes available as well.
// A non-allocating version of this would be nice...
impl From<CChar> for Vec<u8> {
    fn from(c: CChar) -> Vec<u8> {
        match c {
            CChar::Char(ch) => {
                // A `char` never needs more than four bytes in UTF-8.
                let mut buf = [0u8; 4];
                ch.encode_utf8(&mut buf).as_bytes().to_vec()
            }
            // Raw values are deliberately truncated to one 8-bit character.
            CChar::Raw(value) => vec![value as u8],
        }
    }
}
89
90/// ensures the child parser consumes the whole input
91pub fn full<I: Clone, O, F>(
92    f: F,
93) -> impl Fn(I) -> nom::IResult<I, O>
94where
95    I: nom::InputLength,
96    F: Fn(I) -> nom::IResult<I, O>,
97{
98    move |input| {
99        let res = f(input);
100        match res {
101            Ok((i, o)) => {
102                if i.input_len() == 0 {
103                    Ok((i, o))
104                } else {
105                    Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::Complete)))
106                }
107            }
108            r => r,
109        }
110    }
111}
112
113// =================================
114// ======== matching digits ========
115// =================================
116
117macro_rules! byte {
118	($($p: pat)|* ) => {{
119        fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> {
120            match i.split_first() {
121                $(Some((&c @ $p,rest)))|* => Ok((rest,c)),
122                Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))),
123                None => Err(nom::Err::Incomplete(Needed::new(1))),
124            }
125        }
126
127        parser
128	}}
129}
130
131fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> {
132    byte!(b'0'..=b'1')(i)
133}
134
135fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> {
136    byte!(b'0'..=b'7')(i)
137}
138
139fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> {
140    byte!(b'0'..=b'9')(i)
141}
142
143fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> {
144    byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F')(i)
145}
146
147// ========================================
148// ======== characters and strings ========
149// ========================================
150
151fn escape2char(c: char) -> CChar {
152    CChar::Char(match c {
153        'a' => '\x07',
154        'b' => '\x08',
155        'f' => '\x0c',
156        'n' => '\n',
157        'r' => '\r',
158        't' => '\t',
159        'v' => '\x0b',
160        _ => unreachable!("invalid escape {}", c),
161    })
162}
163
164fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> {
165    str::from_utf8(&n)
166        .ok()
167        .and_then(|i| u64::from_str_radix(i, radix).ok())
168        .map(|i| match i {
169            0..=0x7f => CChar::Char(i as u8 as char),
170            _ => CChar::Raw(i),
171        })
172}
173
174fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> {
175    str::from_utf8(&n)
176        .ok()
177        .and_then(|i| u32::from_str_radix(i, 16).ok())
178        .and_then(char::from_u32)
179        .map(CChar::Char)
180}
181
182fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> {
183    preceded(
184        char('\\'),
185        alt((
186            map(one_of(r#"'"?\"#), CChar::Char),
187            map(one_of("abfnrtv"), escape2char),
188            map_opt(many_m_n(1, 3, octal), |v| c_raw_escape(v, 8)),
189            map_opt(preceded(char('x'), many1(hexadecimal)), |v| {
190                c_raw_escape(v, 16)
191            }),
192            map_opt(
193                preceded(char('u'), many_m_n(4, 4, hexadecimal)),
194                c_unicode_escape,
195            ),
196            map_opt(
197                preceded(char('U'), many_m_n(8, 8, hexadecimal)),
198                c_unicode_escape,
199            ),
200        )),
201    )(i)
202}
203
204fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> {
205    alt((tag("u8"), tag("u"), tag("U"), tag("L")))(i)
206}
207
208fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> {
209    delimited(
210        terminated(opt(c_width_prefix), char('\'')),
211        alt((
212            escaped_char,
213            map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from),
214        )),
215        char('\''),
216    )(i)
217}
218
219fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> {
220    delimited(
221        alt((preceded(c_width_prefix, char('"')), char('"'))),
222        fold_many0(
223            alt((
224                map(escaped_char, |c: CChar| c.into()),
225                map(is_not([b'\\', b'"']), |c: &[u8]| c.into()),
226            )),
227            Vec::new,
228            |mut v: Vec<u8>, res: Vec<u8>| {
229                v.extend_from_slice(&res);
230                v
231            },
232        ),
233        char('"'),
234    )(i)
235}
236
237// ================================
238// ======== parse integers ========
239// ================================
240
/// Converts ASCII digits in the given `radix` into a `u64`.
///
/// Returns `None` when `n` is empty, is not valid UTF-8, contains a digit
/// that is invalid for `radix`, or encodes a value larger than `u64::MAX`.
fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> {
    let digits = str::from_utf8(&n).ok()?;
    u64::from_str_radix(digits, radix).ok()
}
246
247fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> {
248    let r = input.split_at_position(|c| c != b'u' && c != b'U' && c != b'l' && c != b'L');
249    match r {
250        Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)),
251        res => res,
252    }
253}
254
255fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> {
256    map(
257        terminated(
258            alt((
259                map_opt(preceded(tag("0x"), many1(complete(hexadecimal))), |v| {
260                    c_int_radix(v, 16)
261                }),
262                map_opt(preceded(tag("0X"), many1(complete(hexadecimal))), |v| {
263                    c_int_radix(v, 16)
264                }),
265                map_opt(preceded(tag("0b"), many1(complete(binary))), |v| {
266                    c_int_radix(v, 2)
267                }),
268                map_opt(preceded(tag("0B"), many1(complete(binary))), |v| {
269                    c_int_radix(v, 2)
270                }),
271                map_opt(preceded(char('0'), many1(complete(octal))), |v| {
272                    c_int_radix(v, 8)
273                }),
274                map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)),
275                |input| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))),
276            )),
277            opt(take_ul),
278        ),
279        |i| i as i64,
280    )(i)
281}
282
283// ==============================
284// ======== parse floats ========
285// ==============================
286
287fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> {
288    nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L'))(i)
289}
290
291fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> {
292    preceded(
293        byte!(b'e' | b'E'),
294        pair(opt(byte!(b'-' | b'+')), many1(complete(decimal))),
295    )(i)
296}
297
298fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> {
299    map_opt(
300        alt((
301            terminated(
302                recognize(tuple((
303                    many1(complete(decimal)),
304                    byte!(b'.'),
305                    many0(complete(decimal)),
306                ))),
307                opt(float_width),
308            ),
309            terminated(
310                recognize(tuple((
311                    many0(complete(decimal)),
312                    byte!(b'.'),
313                    many1(complete(decimal)),
314                ))),
315                opt(float_width),
316            ),
317            terminated(
318                recognize(tuple((
319                    many0(complete(decimal)),
320                    opt(byte!(b'.')),
321                    many1(complete(decimal)),
322                    float_exp,
323                ))),
324                opt(float_width),
325            ),
326            terminated(
327                recognize(tuple((
328                    many1(complete(decimal)),
329                    opt(byte!(b'.')),
330                    many0(complete(decimal)),
331                    float_exp,
332                ))),
333                opt(float_width),
334            ),
335            terminated(recognize(many1(complete(decimal))), float_width),
336        )),
337        |v| str::from_utf8(v).ok().and_then(|i| f64::from_str(i).ok()),
338    )(i)
339}
340
341// ================================
342// ======== main interface ========
343// ================================
344
345fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> {
346    alt((
347        map(full(c_char), EvalResult::Char),
348        map(full(c_int), |i| EvalResult::Int(::std::num::Wrapping(i))),
349        map(full(c_float), EvalResult::Float),
350        map(full(c_string), EvalResult::Str),
351    ))(input)
352    .to_cexpr_result()
353}
354
355/// Parse a C literal.
356///
357/// The input must contain exactly the representation of a single literal
358/// token, and in particular no whitespace or sign prefixes.
359pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> {
360    crate::assert_full_parse(one_literal(input))
361}