symphonia_core/
probe.rs

1// Symphonia
2// Copyright (c) 2019-2022 The Project Symphonia Developers.
3//
4// This Source Code Form is subject to the terms of the Mozilla Public
5// License, v. 2.0. If a copy of the MPL was not distributed with this
6// file, You can obtain one at https://mozilla.org/MPL/2.0/.
7
8//! The `probe` module provides methods and traits to support auto-detection of media formats from
9//! arbitrary media streams.
10
11use crate::errors::{unsupported_error, Result};
12use crate::formats::{FormatOptions, FormatReader};
13use crate::io::{MediaSourceStream, ReadBytes, SeekBuffered};
14use crate::meta::{Metadata, MetadataLog, MetadataOptions, MetadataReader};
15
16use log::{debug, error, info};
17
/// A small, fixed-size bloom filter keyed on 2-byte values.
///
/// `Probe::register` inserts the first two bytes of every registered marker, and `Probe::next`
/// queries the filter for each 2-byte window of the stream so that the full marker comparison is
/// only attempted on windows that may begin a marker.
mod bloom {

    /// Computes the 32-bit FNV-1a hash of a 2-byte key.
    fn fnv1a32(value: &[u8; 2]) -> u32 {
        const INIT: u32 = 0x811c_9dc5;
        const PRIME: u32 = 0x0100_0193;

        value.iter().fold(INIT, |state, &byte| (state ^ u32::from(byte)).wrapping_mul(PRIME))
    }

    /// A bloom filter with `M` bits that sets and tests three bits per key.
    pub struct BloomFilter {
        /// The bit array, packed 64 bits per word.
        filter: Box<[u64]>,
    }

    impl Default for BloomFilter {
        /// Creates an empty filter (all bits cleared).
        fn default() -> Self {
            BloomFilter { filter: vec![0; BloomFilter::M >> 6].into_boxed_slice() }
        }
    }

    impl BloomFilter {
        /// The number of bits, m, used by the bloom filter. Use 16384 bits (2KiB) by default.
        const M: usize = 2 * 1024 * 8;

        /// Derives the three `(word index, bit mask)` positions associated with `key`.
        ///
        /// The 32-bit hash is split into two 16-bit halves, `h0` (upper) and `h1` (lower), and
        /// the k-th bit index is `h0 + k * h1` (wrapping in 16 bits) reduced modulo `M`. Both
        /// `insert` and `may_contain` go through this helper so they always agree on the bits
        /// that belong to a key.
        fn positions(key: &[u8; 2]) -> [(usize, u64); 3] {
            let hash = fnv1a32(key);

            let h0 = (hash >> 16) as u16;
            // Intentional truncation: keep only the lower 16 bits of the hash.
            let h1 = hash as u16;

            let position = |k: u16| {
                // `M` is a power of two, so masking with `M - 1` is a modulo operation.
                let bit = usize::from(h0.wrapping_add(h1.wrapping_mul(k))) & (BloomFilter::M - 1);
                (bit >> 6, 1u64 << (bit & 63))
            };

            [position(0), position(1), position(2)]
        }

        /// Adds `key` to the filter.
        pub fn insert(&mut self, key: &[u8; 2]) {
            for &(word, mask) in Self::positions(key).iter() {
                self.filter[word] |= mask;
            }
        }

        /// Returns `false` if `key` was definitely never inserted, and `true` if it may have
        /// been. False positives are possible; false negatives are not.
        pub fn may_contain(&self, key: &[u8; 2]) -> bool {
            Self::positions(key).iter().all(|&(word, mask)| (self.filter[word] & mask) != 0)
        }
    }
}
86
/// `Instantiate` is an enumeration of instantiation functions used by `Descriptor` and `Probe` to
/// instantiate metadata and container format readers.
///
/// The variant identifies which kind of reader the wrapped function pointer produces, so a caller
/// can `match` on it to decide how to proceed with the stream.
#[derive(Copy, Clone)]
pub enum Instantiate {
    /// Instantiation function for a `FormatReader`.
    ///
    /// The function takes ownership of the `MediaSourceStream`, borrows the `FormatOptions`, and
    /// may fail.
    Format(fn(MediaSourceStream, &FormatOptions) -> Result<Box<dyn FormatReader>>),
    /// Instantiation function for a `MetadataReader`.
    ///
    /// The function only borrows the `MetadataOptions` and cannot fail.
    Metadata(fn(&MetadataOptions) -> Box<dyn MetadataReader>),
}
96
/// `Descriptor` provides declarative information about container and metadata formats.
/// `Descriptor`s are used by `Probe` and related machinery to scan a `MediaSourceStream` for media.
#[derive(Copy, Clone)]
pub struct Descriptor {
    /// A short ASCII-only string identifying the format.
    pub short_name: &'static str,
    /// A longer, more descriptive, string identifying the format.
    pub long_name: &'static str,
    /// A list of case-insensitive file extensions that are generally used by the format.
    pub extensions: &'static [&'static str],
    /// A list of case-insensitive MIME types that are generally used by the format.
    pub mime_types: &'static [&'static str],
    /// A list of byte-string start-of-stream markers that will be searched for within the stream.
    ///
    /// Each marker must be between 2 and 16 bytes long; `Probe::register` panics otherwise.
    pub markers: &'static [&'static [u8]],
    /// A function to score a context buffer.
    ///
    /// Note: `Probe::next` does not consult this function yet (scoring is marked as a TODO
    /// there).
    pub score: fn(&[u8]) -> u8,
    /// An instantiation function.
    pub inst: Instantiate,
}
116
/// The `QueryDescriptor` trait indicates that the implementer may be registered with a `Probe`
/// and is capable of probing.
pub trait QueryDescriptor {
    /// Returns the list of descriptors supported by the implementer.
    ///
    /// `Probe::register_all` registers every descriptor in the returned slice.
    fn query() -> &'static [Descriptor];

    /// Using the provided context buffer, calculates and returns a score between 0 and 255
    /// indicating the confidence of the reader in decoding or parsing the source stream.
    fn score(context: &[u8]) -> u8;
}
127
/// A `Hint` supplies extra context about a media source stream to the probe.
///
/// A `MediaSourceStream` abstracts away details such as the file extension or MIME type of the
/// media, so the `Probe` cannot inspect them itself. The embedder, however, may know them from a
/// file path, an HTTP header, email attachment metadata, and so on. `Hint`s are optional and are
/// intended to provide an informed initial guess that speeds up format detection significantly,
/// especially as more formats are registered; a wrong hint is not meant to lead the probe astray.
///
/// Note: `Probe::format` in this file accepts a `Hint` but does not consult it yet.
#[derive(Clone, Debug, Default)]
pub struct Hint {
    /// The file extension of the media, if known.
    extension: Option<String>,
    /// The MIME/Media-type of the media, if known.
    mime_type: Option<String>,
}

impl Hint {
    /// Creates a `Hint` that carries no information.
    pub fn new() -> Self {
        Self::default()
    }

    /// Records a file extension in the `Hint`, replacing any previous one.
    ///
    /// Returns `self` so that calls can be chained.
    pub fn with_extension(&mut self, extension: &str) -> &mut Self {
        self.extension = Some(String::from(extension));
        self
    }

    /// Records a MIME/Media-type in the `Hint`, replacing any previous one.
    ///
    /// Returns `self` so that calls can be chained.
    pub fn mime_type(&mut self, mime_type: &str) -> &mut Self {
        self.mime_type = Some(String::from(mime_type));
        self
    }
}
159
160/// Metadata that came from the `metadata` field of [`ProbeResult`].
161pub struct ProbedMetadata {
162    metadata: Option<MetadataLog>,
163}
164
165impl ProbedMetadata {
166    /// Returns the metadata that was found during probing.
167    ///
168    /// If any additional metadata was present outside of the container, this is
169    /// `Some` and the log will have at least one item in it.
170    pub fn get(&mut self) -> Option<Metadata<'_>> {
171        self.metadata.as_mut().map(|m| m.metadata())
172    }
173
174    /// Returns the inner metadata log, if it was present.
175    pub fn into_inner(self) -> Option<MetadataLog> {
176        self.metadata
177    }
178}
179
/// `ProbeResult` contains the result of a format probe operation.
///
/// It is returned by `Probe::format` once a container format has been found and its reader has
/// been instantiated successfully.
pub struct ProbeResult {
    /// An instance of a `FormatReader` for the probed format.
    pub format: Box<dyn FormatReader>,
    /// A log of `Metadata` revisions read during the probe operation before the instantiation of
    /// the `FormatReader`.
    ///
    /// Metadata that was part of the container format itself can be read by calling `.metadata()`
    /// on `format`.
    pub metadata: ProbedMetadata,
}
191
/// `Probe` scans a `MediaSourceStream` for metadata and container formats, and provides an
/// iterator-like interface to instantiate readers for the formats encountered.
///
/// A default-constructed `Probe` has no registered descriptors and therefore detects nothing
/// until descriptors are added with `register` or `register_all`.
#[derive(Default)]
pub struct Probe {
    /// Bloom filter holding the 2-byte prefix of every marker of every registered descriptor.
    filter: bloom::BloomFilter,
    /// All registered descriptors, in registration order.
    registered: Vec<Descriptor>,
}
199
200impl Probe {
201    const PROBE_SEARCH_LIMIT: u64 = 1 * 1024 * 1024;
202
203    /// Register all `Descriptor`s supported by the parameterized type.
204    pub fn register_all<Q: QueryDescriptor>(&mut self) {
205        for descriptor in Q::query() {
206            self.register(descriptor);
207        }
208    }
209
210    /// Register a single `Descriptor`.
211    pub fn register(&mut self, descriptor: &Descriptor) {
212        // Insert 2-byte prefixes for each marker into the bloom filter.
213        for marker in descriptor.markers {
214            let mut prefix = [0u8; 2];
215
216            match marker.len() {
217                2..=16 => prefix.copy_from_slice(&marker[0..2]),
218                _ => panic!("invalid marker length (only 2-16 bytes supported)."),
219            }
220
221            self.filter.insert(&prefix);
222        }
223
224        self.registered.push(*descriptor);
225    }
226
227    /// Searches the provided `MediaSourceStream` for metadata or a container format.
228    pub fn next(&self, mss: &mut MediaSourceStream) -> Result<Instantiate> {
229        let mut win = 0u16;
230
231        let init_pos = mss.pos();
232        let mut count = 0;
233
234        // Scan the stream byte-by-byte. Shifting each byte through a 2-byte window.
235        while let Ok(byte) = mss.read_byte() {
236            win = (win << 8) | u16::from(byte);
237
238            count += 1;
239
240            if count > Probe::PROBE_SEARCH_LIMIT {
241                break;
242            }
243
244            if count % 4096 == 0 {
245                debug!(
246                    "searching for format marker... {}+{} / {} bytes.",
247                    init_pos,
248                    count,
249                    Probe::PROBE_SEARCH_LIMIT
250                );
251            }
252
253            // Use the bloom filter to check if the the window may be a prefix of a registered
254            // marker.
255            if self.filter.may_contain(&win.to_be_bytes()) {
256                // Using the 2-byte window, and a further 14 bytes, create a larger 16-byte window.
257                let mut context = [0u8; 16];
258
259                context[0..2].copy_from_slice(&win.to_be_bytes()[0..2]);
260                mss.read_buf_exact(&mut context[2..])?;
261
262                debug!(
263                    "found a possible format marker within {:x?} @ {}+{} bytes.",
264                    context, init_pos, count,
265                );
266
267                // Search for registered markers in the 16-byte window.
268                for registered in &self.registered {
269                    for marker in registered.markers {
270                        let len = marker.len();
271
272                        // If a match is found, return the instantiate.
273                        if context[0..len] == **marker {
274                            // Re-align the stream to the start of the marker.
275                            mss.seek_buffered_rev(16);
276
277                            // TODO: Implement scoring.
278
279                            info!(
280                                "found the format marker {:x?} @ {}+{} bytes.",
281                                &context[0..len],
282                                init_pos,
283                                count,
284                            );
285
286                            return Ok(registered.inst);
287                        }
288                    }
289                }
290
291                // If no registered markers were matched, then the bloom filter returned a false
292                // positive. Re-align the stream to the end of the 2-byte window and continue the
293                // search.
294                mss.seek_buffered_rev(16 - 2);
295            }
296        }
297
298        if count < Probe::PROBE_SEARCH_LIMIT {
299            error!("probe reach EOF at {} bytes.", count);
300        }
301        else {
302            // Could not find any marker within the probe limit.
303            error!("reached probe limit of {} bytes.", Probe::PROBE_SEARCH_LIMIT);
304        }
305
306        unsupported_error("core (probe): no suitable format reader found")
307    }
308
309    /// Searches the provided `MediaSourceStream` for a container format. Any metadata that is read
310    /// during the search will be queued and attached to the `FormatReader` instance once a
311    /// container format is found.
312    pub fn format(
313        &self,
314        _hint: &Hint,
315        mut mss: MediaSourceStream,
316        format_opts: &FormatOptions,
317        metadata_opts: &MetadataOptions,
318    ) -> Result<ProbeResult> {
319        let mut metadata: MetadataLog = Default::default();
320
321        // Loop over all elements in the stream until a container format is found.
322        loop {
323            match self.next(&mut mss)? {
324                // If a container format is found, return an instance to it's reader.
325                Instantiate::Format(fmt) => {
326                    let format = fmt(mss, format_opts)?;
327
328                    let metadata =
329                        if metadata.metadata().current().is_some() { Some(metadata) } else { None };
330
331                    return Ok(ProbeResult { format, metadata: ProbedMetadata { metadata } });
332                }
333                // If metadata was found, instantiate the metadata reader, read the metadata, and
334                // push it onto the metadata log.
335                Instantiate::Metadata(meta) => {
336                    let mut reader = meta(metadata_opts);
337                    metadata.push(reader.read_all(&mut mss)?);
338
339                    debug!("chaining a metadata element.");
340                }
341            }
342        }
343
344        // This function returns when either the end-of-stream is reached, an error occurs, or a
345        // container format is found.
346    }
347}
348
/// Convenience macro for declaring a probe `Descriptor` for a `FormatReader`.
///
/// The macro must be invoked from within an `impl` block (it refers to `Self`) of a type that
/// provides a `score` function and a fallible `try_new` constructor. The arguments are, in order,
/// the short name, long name, file extensions, MIME types, and markers of the format.
///
/// `Descriptor` and `Instantiate` are referenced through `$crate`, so the caller does not need to
/// import them for the expansion to compile.
#[macro_export]
macro_rules! support_format {
    ($short_name:expr, $long_name:expr, $exts:expr, $mimes:expr, $markers:expr) => {
        $crate::probe::Descriptor {
            short_name: $short_name,
            long_name: $long_name,
            extensions: $exts,
            mime_types: $mimes,
            markers: $markers,
            score: Self::score,
            inst: $crate::probe::Instantiate::Format(|source, opt| {
                Ok(Box::new(Self::try_new(source, &opt)?))
            }),
        }
    };
}
364
/// Convenience macro for declaring a probe `Descriptor` for a `MetadataReader`.
///
/// The macro must be invoked from within an `impl` block (it refers to `Self`) of a type that
/// provides a `score` function and an infallible `new` constructor. The arguments are, in order,
/// the short name, long name, file extensions, MIME types, and markers of the metadata format.
///
/// `Descriptor` and `Instantiate` are referenced through `$crate`, so the caller does not need to
/// import them for the expansion to compile.
#[macro_export]
macro_rules! support_metadata {
    ($short_name:expr, $long_name:expr, $exts:expr, $mimes:expr, $markers:expr) => {
        $crate::probe::Descriptor {
            short_name: $short_name,
            long_name: $long_name,
            extensions: $exts,
            mime_types: $mimes,
            markers: $markers,
            score: Self::score,
            inst: $crate::probe::Instantiate::Metadata(|opt| Box::new(Self::new(&opt))),
        }
    };
}