roxmltree/
tokenizer.rs

Help
1use core::ops::Range;
2use core::str;
3
4use crate::{Error, TextPos};
5
6type Result<T> = core::result::Result<T, Error>;
7
/// Extension methods for XML-subset only operations.
///
/// Each predicate tests a value against one character class
/// defined by the XML specification.
trait XmlCharExt {
    /// Checks if the value is within the
    /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
    fn is_xml_name_start(&self) -> bool;

    /// Checks if the value is within the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;

    /// Checks if the value is within the
    /// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
    fn is_xml_char(&self) -> bool;
}
22
23impl XmlCharExt for char {
24    #[inline]
25    fn is_xml_name_start(&self) -> bool {
26        // Check for ASCII first.
27        if *self as u32 <= 128 {
28            return matches!(*self as u8, b'A'..=b'Z' | b'a'..=b'z' | b':' | b'_');
29        }
30
31        matches!(*self as u32,
32            0x0000C0..=0x0000D6
33            | 0x0000D8..=0x0000F6
34            | 0x0000F8..=0x0002FF
35            | 0x000370..=0x00037D
36            | 0x00037F..=0x001FFF
37            | 0x00200C..=0x00200D
38            | 0x002070..=0x00218F
39            | 0x002C00..=0x002FEF
40            | 0x003001..=0x00D7FF
41            | 0x00F900..=0x00FDCF
42            | 0x00FDF0..=0x00FFFD
43            | 0x010000..=0x0EFFFF)
44    }
45
46    #[inline]
47    fn is_xml_name(&self) -> bool {
48        // Check for ASCII first.
49        if *self as u32 <= 128 {
50            return (*self as u8).is_xml_name();
51        }
52
53        matches!(*self as u32, 0x0000B7
54                | 0x0000C0..=0x0000D6
55                | 0x0000D8..=0x0000F6
56                | 0x0000F8..=0x0002FF
57                | 0x000300..=0x00036F
58                | 0x000370..=0x00037D
59                | 0x00037F..=0x001FFF
60                | 0x00200C..=0x00200D
61                | 0x00203F..=0x002040
62                | 0x002070..=0x00218F
63                | 0x002C00..=0x002FEF
64                | 0x003001..=0x00D7FF
65                | 0x00F900..=0x00FDCF
66                | 0x00FDF0..=0x00FFFD
67                | 0x010000..=0x0EFFFF)
68    }
69
70    #[inline]
71    fn is_xml_char(&self) -> bool {
72        // Does not check for surrogate code points U+D800-U+DFFF,
73        // since that check was performed by Rust when the `&str` was constructed.
74        if (*self as u32) < 0x20 {
75            return (*self as u8).is_xml_space();
76        }
77
78        !matches!(*self as u32, 0xFFFF | 0xFFFE)
79    }
80}
81
/// Byte-level (ASCII only) counterparts of the XML character checks.
trait XmlByteExt {
    /// Checks if byte is a space.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Checks if byte is within the ASCII subset of the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;
}
92
93impl XmlByteExt for u8 {
94    #[inline]
95    fn is_xml_space(&self) -> bool {
96        matches!(*self, b' ' | b'\t' | b'\n' | b'\r')
97    }
98
99    #[inline]
100    fn is_xml_name(&self) -> bool {
101        matches!(*self, b'A'..=b'Z' | b'a'..=b'z'| b'0'..=b'9'| b':' | b'_' | b'-' | b'.')
102    }
103}
104
/// A string slice.
///
/// Like `&str`, but also contains the position in the input XML
/// from which it was parsed.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    // The slice itself.
    text: &'input str,
    // Byte offset at which `text` starts; `range()` reports
    // `start..start + text.len()`.
    start: usize,
}
115
116impl<'input> From<&'input str> for StrSpan<'input> {
117    #[inline]
118    fn from(text: &'input str) -> Self {
119        StrSpan { text, start: 0 }
120    }
121}
122
123impl<'input> StrSpan<'input> {
124    #[inline]
125    pub fn from_substr(text: &str, start: usize, end: usize) -> StrSpan {
126        debug_assert!(start <= end);
127        StrSpan {
128            text: &text[start..end],
129            start,
130        }
131    }
132
133    #[inline]
134    pub fn range(&self) -> Range<usize> {
135        self.start..(self.start + self.text.len())
136    }
137
138    #[inline]
139    pub fn as_str(&self) -> &'input str {
140        self.text
141    }
142
143    #[inline]
144    fn slice_region(&self, start: usize, end: usize) -> &'input str {
145        &self.text[start..end]
146    }
147}
148
/// A single token produced by the tokenizer and handed to `XmlEvents::token`.
pub enum Token<'input> {
    /// `<?target content?>`
    ///
    /// Fields: target, optional content (`None` when empty) and the byte range
    /// of the whole instruction.
    ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),

    /// `<!-- text -->`
    ///
    /// Fields: comment text and the byte range of the whole comment.
    Comment(&'input str, Range<usize>),

    /// `<!ENTITY ns_extend "http://test.com">`
    ///
    /// Fields: entity name and its quoted value (without the quotes).
    EntityDeclaration(&'input str, StrSpan<'input>),

    /// `<ns:elem`
    ///
    /// Fields: prefix, local name and the position of the opening `<`.
    ElementStart(&'input str, &'input str, usize),

    /// `ns:attr="value"`
    ///
    /// Fields: byte range of the whole attribute, length of the qualified name,
    /// length of the `=` part including surrounding spaces (both lengths are
    /// clamped to the type maximum), prefix, local name and the value.
    Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),

    /// End of an element tag; see `ElementEnd`. The range covers the end marker.
    ElementEnd(ElementEnd<'input>, Range<usize>),

    /// Contains text between elements including whitespaces.
    /// Basically everything between `>` and `<`.
    /// Except `]]>`, which is not allowed and will lead to an error.
    Text(&'input str, Range<usize>),

    /// `<![CDATA[text]]>`
    ///
    /// Fields: the text between the markers and the byte range of the whole section.
    Cdata(&'input str, Range<usize>),
}
175
/// `ElementEnd` token.
///
/// Describes how an element tag was terminated.
#[derive(Clone, Copy)]
pub enum ElementEnd<'input> {
    /// Indicates `>`
    Open,
    /// Indicates `</ns:name>`; carries the prefix and the local name.
    Close(&'input str, &'input str),
    /// Indicates `/>`
    Empty,
}
186
/// Receiver of the tokens produced while parsing.
pub trait XmlEvents<'input> {
    /// Called for every token in document order.
    ///
    /// The parsing functions in this file propagate a returned error with `?`,
    /// so an `Err` stops parsing.
    fn token(&mut self, token: Token<'input>) -> Result<()>;
}
190
191// document ::= prolog element Misc*
192pub fn parse<'input>(
193    text: &'input str,
194    allow_dtd: bool,
195    events: &mut dyn XmlEvents<'input>,
196) -> Result<()> {
197    let s = &mut Stream::new(text);
198
199    // Skip UTF-8 BOM.
200    if s.starts_with(&[0xEF, 0xBB, 0xBF]) {
201        s.advance(3);
202    }
203
204    if s.starts_with(b"<?xml ") {
205        parse_declaration(s)?;
206    }
207
208    parse_misc(s, events)?;
209
210    s.skip_spaces();
211    if s.starts_with(b"<!DOCTYPE") {
212        if !allow_dtd {
213            return Err(Error::DtdDetected);
214        }
215
216        parse_doctype(s, events)?;
217        parse_misc(s, events)?;
218    }
219
220    s.skip_spaces();
221    if s.curr_byte().ok() == Some(b'<') {
222        parse_element(s, events)?;
223    }
224
225    parse_misc(s, events)?;
226
227    if !s.at_end() {
228        return Err(Error::UnknownToken(s.gen_text_pos()));
229    }
230
231    Ok(())
232}
233
234// Misc ::= Comment | PI | S
235fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
236    while !s.at_end() {
237        s.skip_spaces();
238        if s.starts_with(b"<!--") {
239            parse_comment(s, events)?;
240        } else if s.starts_with(b"<?") {
241            parse_pi(s, events)?;
242        } else {
243            break;
244        }
245    }
246
247    Ok(())
248}
249
250// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
251//
252// We don't actually return a token for the XML declaration and only validate it.
253fn parse_declaration(s: &mut Stream) -> Result<()> {
254    fn consume_spaces(s: &mut Stream) -> Result<()> {
255        if s.starts_with_space() {
256            s.skip_spaces();
257        } else if !s.starts_with(b"?>") && !s.at_end() {
258            return Err(Error::InvalidChar2(
259                "a whitespace",
260                s.curr_byte_unchecked(),
261                s.gen_text_pos(),
262            ));
263        }
264
265        Ok(())
266    }
267
268    s.advance(5); // <?xml
269    consume_spaces(s)?;
270
271    // The `version` "attribute" is mandatory.
272    if !s.starts_with(b"version") {
273        // Will trigger the InvalidString error, which is what we want.
274        return s.skip_string(b"version");
275    }
276    let _ = parse_attribute(s)?;
277    consume_spaces(s)?;
278
279    if s.starts_with(b"encoding") {
280        let _ = parse_attribute(s)?;
281        consume_spaces(s)?;
282    }
283
284    if s.starts_with(b"standalone") {
285        let _ = parse_attribute(s)?;
286    }
287
288    s.skip_spaces();
289    s.skip_string(b"?>")?;
290
291    Ok(())
292}
293
294// '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
295fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
296    let start = s.pos();
297    s.advance(4);
298    let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
299    s.skip_string(b"-->")?;
300
301    if text.contains("--") {
302        return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
303    }
304
305    if text.ends_with('-') {
306        return Err(Error::InvalidComment(s.gen_text_pos_from(start)));
307    }
308
309    let range = s.range_from(start);
310    events.token(Token::Comment(text, range))?;
311
312    Ok(())
313}
314
315// PI       ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
316// PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
317fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
318    if s.starts_with(b"<?xml ") {
319        return Err(Error::UnexpectedDeclaration(s.gen_text_pos()));
320    }
321
322    let start = s.pos();
323    s.advance(2);
324    let target = s.consume_name()?;
325    s.skip_spaces();
326    let content = s.consume_chars(|s, c| !(c == '?' && s.starts_with(b"?>")))?;
327    let content = if !content.is_empty() {
328        Some(content)
329    } else {
330        None
331    };
332
333    s.skip_string(b"?>")?;
334
335    let range = s.range_from(start);
336    events.token(Token::ProcessingInstruction(target, content, range))?;
337    Ok(())
338}
339
340fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
341    let start = s.pos();
342    parse_doctype_start(s)?;
343    s.skip_spaces();
344
345    if s.curr_byte() == Ok(b'>') {
346        s.advance(1);
347        return Ok(());
348    }
349
350    s.advance(1); // [
351    while !s.at_end() {
352        s.skip_spaces();
353        if s.starts_with(b"<!ENTITY") {
354            parse_entity_decl(s, events)?;
355        } else if s.starts_with(b"<!--") {
356            parse_comment(s, events)?;
357        } else if s.starts_with(b"<?") {
358            parse_pi(s, events)?;
359        } else if s.starts_with(b"]") {
360            // DTD ends with ']' S? '>', therefore we have to skip possible spaces.
361            s.advance(1);
362            s.skip_spaces();
363            match s.curr_byte() {
364                Ok(b'>') => {
365                    s.advance(1);
366                    break;
367                }
368                Ok(c) => {
369                    return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos()));
370                }
371                Err(_) => {
372                    return Err(Error::UnexpectedEndOfStream);
373                }
374            }
375        } else if s.starts_with(b"<!ELEMENT")
376            || s.starts_with(b"<!ATTLIST")
377            || s.starts_with(b"<!NOTATION")
378        {
379            if consume_decl(s).is_err() {
380                let pos = s.gen_text_pos_from(start);
381                return Err(Error::UnknownToken(pos));
382            }
383        } else {
384            return Err(Error::UnknownToken(s.gen_text_pos()));
385        }
386    }
387
388    Ok(())
389}
390
391// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
392fn parse_doctype_start(s: &mut Stream) -> Result<()> {
393    s.advance(9);
394
395    s.consume_spaces()?;
396    s.skip_name()?;
397    s.skip_spaces();
398
399    let _ = parse_external_id(s)?;
400    s.skip_spaces();
401
402    let c = s.curr_byte()?;
403    if c != b'[' && c != b'>' {
404        return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos()));
405    }
406
407    Ok(())
408}
409
410// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
411fn parse_external_id(s: &mut Stream) -> Result<bool> {
412    let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
413        let start = s.pos();
414        s.advance(6);
415        let id = s.slice_back(start);
416
417        s.consume_spaces()?;
418        let quote = s.consume_quote()?;
419        let _ = s.consume_bytes(|c| c != quote);
420        s.consume_byte(quote)?;
421
422        if id == "SYSTEM" {
423            // Ok
424        } else {
425            s.consume_spaces()?;
426            let quote = s.consume_quote()?;
427            let _ = s.consume_bytes(|c| c != quote);
428            s.consume_byte(quote)?;
429        }
430
431        true
432    } else {
433        false
434    };
435
436    Ok(v)
437}
438
439// EntityDecl  ::= GEDecl | PEDecl
440// GEDecl      ::= '<!ENTITY' S Name S EntityDef S? '>'
441// PEDecl      ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
442fn parse_entity_decl<'input>(
443    s: &mut Stream<'input>,
444    events: &mut dyn XmlEvents<'input>,
445) -> Result<()> {
446    s.advance(8);
447    s.consume_spaces()?;
448
449    let is_ge = if s.try_consume_byte(b'%') {
450        s.consume_spaces()?;
451        false
452    } else {
453        true
454    };
455
456    let name = s.consume_name()?;
457    s.consume_spaces()?;
458    if let Some(definition) = parse_entity_def(s, is_ge)? {
459        events.token(Token::EntityDeclaration(name, definition))?;
460    }
461    s.skip_spaces();
462    s.consume_byte(b'>')?;
463
464    Ok(())
465}
466
467// EntityDef   ::= EntityValue | (ExternalID NDataDecl?)
468// PEDef       ::= EntityValue | ExternalID
469// EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' |  "'" ([^%&']
470//                             | PEReference | Reference)* "'"
471// ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
472// NDataDecl   ::= S 'NDATA' S Name
473fn parse_entity_def<'input>(
474    s: &mut Stream<'input>,
475    is_ge: bool,
476) -> Result<Option<StrSpan<'input>>> {
477    let c = s.curr_byte()?;
478    match c {
479        b'"' | b'\'' => {
480            let quote = s.consume_quote()?;
481            let start = s.pos();
482            s.skip_bytes(|c| c != quote);
483            let value = s.slice_back_span(start);
484            s.consume_byte(quote)?;
485            Ok(Some(value))
486        }
487        b'S' | b'P' => {
488            if parse_external_id(s)? {
489                if is_ge {
490                    s.skip_spaces();
491                    if s.starts_with(b"NDATA") {
492                        s.advance(5);
493                        s.consume_spaces()?;
494                        s.skip_name()?;
495                        // TODO: NDataDecl is not supported
496                    }
497                }
498
499                Ok(None)
500            } else {
501                Err(Error::InvalidExternalID(s.gen_text_pos()))
502            }
503        }
504        _ => {
505            let pos = s.gen_text_pos();
506            Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos))
507        }
508    }
509}
510
511fn consume_decl(s: &mut Stream) -> Result<()> {
512    s.skip_bytes(|c| c != b'>');
513    s.consume_byte(b'>')?;
514    Ok(())
515}
516
517// element ::= EmptyElemTag | STag content ETag
518// '<' Name (S Attribute)* S? '>'
519fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
520    let start = s.pos();
521    s.advance(1); // <
522    let (prefix, local) = s.consume_qname()?;
523    events.token(Token::ElementStart(prefix, local, start))?;
524
525    let mut open = false;
526    while !s.at_end() {
527        let has_space = s.starts_with_space();
528        s.skip_spaces();
529        let start = s.pos();
530        match s.curr_byte()? {
531            b'/' => {
532                s.advance(1);
533                s.consume_byte(b'>')?;
534                let range = s.range_from(start);
535                events.token(Token::ElementEnd(ElementEnd::Empty, range))?;
536                break;
537            }
538            b'>' => {
539                s.advance(1);
540                let range = s.range_from(start);
541                events.token(Token::ElementEnd(ElementEnd::Open, range))?;
542                open = true;
543                break;
544            }
545            _ => {
546                // An attribute must be preceded with a whitespace.
547                if !has_space {
548                    // Will always trigger an error. Which is what we want.
549                    s.consume_spaces()?;
550                }
551
552                // Manual inlining of `parse_attribute` for performance.
553                // We cannot mark `parse_attribute` as `#[inline(always)]`
554                // because it will blow up the binary size.
555                let (prefix, local) = s.consume_qname()?;
556                let qname_end = s.pos();
557                let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX);
558                s.consume_eq()?;
559                let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX);
560                let quote = s.consume_quote()?;
561                let quote_c = quote as char;
562                // The attribute value must not contain the < character.
563                let value_start = s.pos();
564                s.skip_chars(|_, c| c != quote_c && c != '<')?;
565                let value = s.slice_back_span(value_start);
566                s.consume_byte(quote)?;
567                let end = s.pos();
568                events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?;
569            }
570        }
571    }
572
573    if open {
574        parse_content(s, events)?;
575    }
576
577    Ok(())
578}
579
580// Attribute ::= Name Eq AttValue
581fn parse_attribute<'input>(
582    s: &mut Stream<'input>,
583) -> Result<(&'input str, &'input str, StrSpan<'input>)> {
584    let (prefix, local) = s.consume_qname()?;
585    s.consume_eq()?;
586    let quote = s.consume_quote()?;
587    let quote_c = quote as char;
588    // The attribute value must not contain the < character.
589    let value_start = s.pos();
590    s.skip_chars(|_, c| c != quote_c && c != '<')?;
591    let value = s.slice_back_span(value_start);
592    s.consume_byte(quote)?;
593    Ok((prefix, local, value))
594}
595
596// content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
597pub fn parse_content<'input>(
598    s: &mut Stream<'input>,
599    events: &mut dyn XmlEvents<'input>,
600) -> Result<()> {
601    while !s.at_end() {
602        match s.curr_byte() {
603            Ok(b'<') => match s.next_byte() {
604                Ok(b'!') => {
605                    if s.starts_with(b"<!--") {
606                        parse_comment(s, events)?;
607                    } else if s.starts_with(b"<![CDATA[") {
608                        parse_cdata(s, events)?;
609                    } else {
610                        return Err(Error::UnknownToken(s.gen_text_pos()));
611                    }
612                }
613                Ok(b'?') => parse_pi(s, events)?,
614                Ok(b'/') => {
615                    parse_close_element(s, events)?;
616                    break;
617                }
618                Ok(_) => parse_element(s, events)?,
619                Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
620            },
621            Ok(_) => parse_text(s, events)?,
622            Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
623        }
624    }
625
626    Ok(())
627}
628
629// CDSect  ::= CDStart CData CDEnd
630// CDStart ::= '<![CDATA['
631// CData   ::= (Char* - (Char* ']]>' Char*))
632// CDEnd   ::= ']]>'
633fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
634    let start = s.pos();
635    s.advance(9); // <![CDATA[
636    let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?;
637    s.skip_string(b"]]>")?;
638    let range = s.range_from(start);
639    events.token(Token::Cdata(text, range))?;
640    Ok(())
641}
642
643// '</' Name S? '>'
644fn parse_close_element<'input>(
645    s: &mut Stream<'input>,
646    events: &mut dyn XmlEvents<'input>,
647) -> Result<()> {
648    let start = s.pos();
649    s.advance(2); // </
650
651    let (prefix, tag_name) = s.consume_qname()?;
652    s.skip_spaces();
653    s.consume_byte(b'>')?;
654
655    let range = s.range_from(start);
656    events.token(Token::ElementEnd(
657        ElementEnd::Close(prefix, tag_name),
658        range,
659    ))?;
660    Ok(())
661}
662
663fn parse_text<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
664    let start = s.pos();
665    let text = s.consume_chars(|_, c| c != '<')?;
666
667    // According to the spec, `]]>` must not appear inside a Text node.
668    // https://www.w3.org/TR/xml/#syntax
669    //
670    // Search for `>` first, since it's a bit faster than looking for `]]>`.
671    if text.contains('>') && text.contains("]]>") {
672        return Err(Error::InvalidCharacterData(s.gen_text_pos()));
673    }
674
675    let range = s.range_from(start);
676    events.token(Token::Text(text, range))?;
677    Ok(())
678}
679
/// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value.
#[derive(Clone, Copy)]
pub enum Reference<'input> {
    /// An entity reference.
    ///
    /// Carries the entity name, without the surrounding `&` and `;`.
    ///
    /// <https://www.w3.org/TR/xml/#NT-EntityRef>
    Entity(&'input str),

    /// A character reference.
    ///
    /// Also used for the predefined entities
    /// `quot`, `amp`, `apos`, `lt` and `gt`.
    ///
    /// <https://www.w3.org/TR/xml/#NT-CharRef>
    Char(char),
}
693
/// A cursor over the input text.
#[derive(Clone)]
pub struct Stream<'input> {
    // Current byte offset into `span`.
    pos: usize,
    // Exclusive end offset; the stream is at its end once `pos >= end`.
    end: usize,
    // The text being read.
    span: StrSpan<'input>,
}
700
701impl<'input> Stream<'input> {
702    #[inline]
703    pub fn new(text: &'input str) -> Self {
704        Stream {
705            pos: 0,
706            end: text.len(),
707            span: text.into(),
708        }
709    }
710
711    #[inline]
712    pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self {
713        Stream {
714            pos: fragment.start,
715            end: fragment.end,
716            span: text.into(),
717        }
718    }
719
720    #[inline]
721    pub fn pos(&self) -> usize {
722        self.pos
723    }
724
725    #[inline]
726    pub fn at_end(&self) -> bool {
727        self.pos >= self.end
728    }
729
730    #[inline]
731    pub fn curr_byte(&self) -> Result<u8> {
732        if self.at_end() {
733            return Err(Error::UnexpectedEndOfStream);
734        }
735
736        Ok(self.curr_byte_unchecked())
737    }
738
739    #[inline]
740    pub fn curr_byte_unchecked(&self) -> u8 {
741        self.span.text.as_bytes()[self.pos]
742    }
743
744    #[inline]
745    fn next_byte(&self) -> Result<u8> {
746        if self.pos + 1 >= self.end {
747            return Err(Error::UnexpectedEndOfStream);
748        }
749
750        Ok(self.span.as_str().as_bytes()[self.pos + 1])
751    }
752
753    #[inline]
754    pub fn advance(&mut self, n: usize) {
755        debug_assert!(self.pos + n <= self.end);
756        self.pos += n;
757    }
758
759    #[inline]
760    fn starts_with(&self, text: &[u8]) -> bool {
761        self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
762    }
763
764    fn consume_byte(&mut self, c: u8) -> Result<()> {
765        let curr = self.curr_byte()?;
766        if curr != c {
767            return Err(Error::InvalidChar(c, curr, self.gen_text_pos()));
768        }
769
770        self.advance(1);
771        Ok(())
772    }
773
774    // Unlike `consume_byte()` will not return any errors.
775    fn try_consume_byte(&mut self, c: u8) -> bool {
776        match self.curr_byte() {
777            Ok(b) if b == c => {
778                self.advance(1);
779                true
780            }
781            _ => false,
782        }
783    }
784
785    fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
786        if !self.starts_with(text) {
787            let pos = self.gen_text_pos();
788
789            // Assume that all input `text` are valid UTF-8 strings, so unwrap is safe.
790            let expected = str::from_utf8(text).unwrap();
791
792            return Err(Error::InvalidString(expected, pos));
793        }
794
795        self.advance(text.len());
796        Ok(())
797    }
798
799    #[inline]
800    fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str {
801        let start = self.pos;
802        self.skip_bytes(f);
803        self.slice_back(start)
804    }
805
806    fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) {
807        while !self.at_end() && f(self.curr_byte_unchecked()) {
808            self.advance(1);
809        }
810    }
811
812    #[inline]
813    fn consume_chars<F>(&mut self, f: F) -> Result<&'input str>
814    where
815        F: Fn(&Stream, char) -> bool,
816    {
817        let start = self.pos;
818        self.skip_chars(f)?;
819        Ok(self.slice_back(start))
820    }
821
822    #[inline]
823    fn skip_chars<F>(&mut self, f: F) -> Result<()>
824    where
825        F: Fn(&Stream, char) -> bool,
826    {
827        for c in self.chars() {
828            if !c.is_xml_char() {
829                return Err(Error::NonXmlChar(c, self.gen_text_pos()));
830            } else if f(self, c) {
831                self.advance(c.len_utf8());
832            } else {
833                break;
834            }
835        }
836
837        Ok(())
838    }
839
840    #[inline]
841    fn chars(&self) -> str::Chars<'input> {
842        self.span.as_str()[self.pos..self.end].chars()
843    }
844
845    #[inline]
846    fn slice_back(&self, pos: usize) -> &'input str {
847        self.span.slice_region(pos, self.pos)
848    }
849
850    #[inline]
851    fn slice_back_span(&self, pos: usize) -> StrSpan<'input> {
852        StrSpan::from_substr(self.span.text, pos, self.pos)
853    }
854
855    #[inline]
856    fn range_from(&self, start: usize) -> Range<usize> {
857        start..self.pos
858    }
859
860    #[inline]
861    fn skip_spaces(&mut self) {
862        while self.starts_with_space() {
863            self.advance(1);
864        }
865    }
866
867    #[inline]
868    fn starts_with_space(&self) -> bool {
869        !self.at_end() && self.curr_byte_unchecked().is_xml_space()
870    }
871
872    // Like `skip_spaces()`, but checks that first char is actually a space.
873    fn consume_spaces(&mut self) -> Result<()> {
874        if self.at_end() {
875            return Err(Error::UnexpectedEndOfStream);
876        }
877
878        if !self.starts_with_space() {
879            return Err(Error::InvalidChar2(
880                "a whitespace",
881                self.curr_byte_unchecked(),
882                self.gen_text_pos(),
883            ));
884        }
885
886        self.skip_spaces();
887        Ok(())
888    }
889
890    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
891    pub fn try_consume_reference(&mut self) -> Option<Reference<'input>> {
892        let start = self.pos();
893
894        // Consume reference on a substream.
895        let mut s = self.clone();
896        let result = s.consume_reference()?;
897
898        // If the current data is a reference than advance the current stream
899        // by number of bytes read by substream.
900        self.advance(s.pos() - start);
901        Some(result)
902    }
903
904    #[inline(never)]
905    fn consume_reference(&mut self) -> Option<Reference<'input>> {
906        if !self.try_consume_byte(b'&') {
907            return None;
908        }
909
910        let reference = if self.try_consume_byte(b'#') {
911            let (value, radix) = if self.try_consume_byte(b'x') {
912                let value =
913                    self.consume_bytes(|c| matches!(c, b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f'));
914                (value, 16)
915            } else {
916                let value = self.consume_bytes(|c| c.is_ascii_digit());
917                (value, 10)
918            };
919
920            let n = u32::from_str_radix(value, radix).ok()?;
921
922            let c = char::from_u32(n).unwrap_or('\u{FFFD}');
923            if !c.is_xml_char() {
924                return None;
925            }
926
927            Reference::Char(c)
928        } else {
929            let name = self.consume_name().ok()?;
930            match name {
931                "quot" => Reference::Char('"'),
932                "amp" => Reference::Char('&'),
933                "apos" => Reference::Char('\''),
934                "lt" => Reference::Char('<'),
935                "gt" => Reference::Char('>'),
936                _ => Reference::Entity(name),
937            }
938        };
939
940        self.consume_byte(b';').ok()?;
941
942        Some(reference)
943    }
944
945    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
946    fn consume_name(&mut self) -> Result<&'input str> {
947        let start = self.pos();
948        self.skip_name()?;
949
950        let name = self.slice_back(start);
951        if name.is_empty() {
952            return Err(Error::InvalidName(self.gen_text_pos_from(start)));
953        }
954
955        Ok(name)
956    }
957
958    /// The same as `consume_name()`, but does not return a consumed name.
959    fn skip_name(&mut self) -> Result<()> {
960        let start = self.pos();
961        let mut iter = self.chars();
962        if let Some(c) = iter.next() {
963            if c.is_xml_name_start() {
964                self.advance(c.len_utf8());
965            } else {
966                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
967            }
968        }
969
970        for c in iter {
971            if c.is_xml_name() {
972                self.advance(c.len_utf8());
973            } else {
974                break;
975            }
976        }
977
978        Ok(())
979    }
980
981    /// Consumes a qualified XML name and returns it.
982    ///
983    /// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
984    #[inline(never)]
985    fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> {
986        let start = self.pos();
987
988        let mut splitter = None;
989
990        while !self.at_end() {
991            // Check for ASCII first for performance reasons.
992            let b = self.curr_byte_unchecked();
993            if b < 128 {
994                if b == b':' {
995                    if splitter.is_none() {
996                        splitter = Some(self.pos());
997                        self.advance(1);
998                    } else {
999                        // Multiple `:` is an error.
1000                        return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1001                    }
1002                } else if b.is_xml_name() {
1003                    self.advance(1);
1004                } else {
1005                    break;
1006                }
1007            } else {
1008                // Fallback to Unicode code point.
1009                match self.chars().nth(0) {
1010                    Some(c) if c.is_xml_name() => {
1011                        self.advance(c.len_utf8());
1012                    }
1013                    _ => break,
1014                }
1015            }
1016        }
1017
1018        let (prefix, local) = if let Some(splitter) = splitter {
1019            let prefix = self.span.slice_region(start, splitter);
1020            let local = self.slice_back(splitter + 1);
1021            (prefix, local)
1022        } else {
1023            let local = self.slice_back(start);
1024            // Slice an empty prefix. This way we can preserve attribute start position.
1025            (self.span.slice_region(start, start), local)
1026        };
1027
1028        // Prefix must start with a `NameStartChar`.
1029        if let Some(c) = prefix.chars().nth(0) {
1030            if !c.is_xml_name_start() {
1031                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1032            }
1033        }
1034
1035        // Local name must start with a `NameStartChar`.
1036        if let Some(c) = local.chars().nth(0) {
1037            if !c.is_xml_name_start() {
1038                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1039            }
1040        } else {
1041            // If empty - error.
1042            return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1043        }
1044
1045        Ok((prefix, local))
1046    }
1047
1048    fn consume_eq(&mut self) -> Result<()> {
1049        self.skip_spaces();
1050        self.consume_byte(b'=')?;
1051        self.skip_spaces();
1052
1053        Ok(())
1054    }
1055
1056    fn consume_quote(&mut self) -> Result<u8> {
1057        let c = self.curr_byte()?;
1058        if c == b'\'' || c == b'"' {
1059            self.advance(1);
1060            Ok(c)
1061        } else {
1062            Err(Error::InvalidChar2("a quote", c, self.gen_text_pos()))
1063        }
1064    }
1065
1066    /// Calculates a current absolute position.
1067    ///
1068    /// This operation is very expensive. Use only for errors.
1069    #[inline(never)]
1070    pub fn gen_text_pos(&self) -> TextPos {
1071        let text = self.span.as_str();
1072        let end = self.pos;
1073
1074        let row = Self::calc_curr_row(text, end);
1075        let col = Self::calc_curr_col(text, end);
1076        TextPos::new(row, col)
1077    }
1078
1079    /// Calculates an absolute position at `pos`.
1080    ///
1081    /// This operation is very expensive. Use only for errors.
1082    #[inline(never)]
1083    pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
1084        let mut s = self.clone();
1085        s.pos = core::cmp::min(pos, s.span.as_str().len());
1086        s.gen_text_pos()
1087    }
1088
1089    fn calc_curr_row(text: &str, end: usize) -> u32 {
1090        let mut row = 1;
1091        for c in &text.as_bytes()[..end] {
1092            if *c == b'\n' {
1093                row += 1;
1094            }
1095        }
1096
1097        row
1098    }
1099
1100    fn calc_curr_col(text: &str, end: usize) -> u32 {
1101        let mut col = 1;
1102        for c in text[..end].chars().rev() {
1103            if c == '\n' {
1104                break;
1105            } else {
1106                col += 1;
1107            }
1108        }
1109
1110        col
1111    }
1112}
roxmltree/tokenizer.rs

roxmltree/
tokenizer.rs