symphonia_core/probe.rs
1// Symphonia
2// Copyright (c) 2019-2022 The Project Symphonia Developers.
3//
4// This Source Code Form is subject to the terms of the Mozilla Public
5// License, v. 2.0. If a copy of the MPL was not distributed with this
6// file, You can obtain one at https://mozilla.org/MPL/2.0/.
7
8//! The `probe` module provides methods and traits to support auto-detection of media formats from
9//! arbitrary media streams.
10
11use crate::errors::{unsupported_error, Result};
12use crate::formats::{FormatOptions, FormatReader};
13use crate::io::{MediaSourceStream, ReadBytes, SeekBuffered};
14use crate::meta::{Metadata, MetadataLog, MetadataOptions, MetadataReader};
15
16use log::{debug, error, info};
17
mod bloom {

    /// Computes the 32-bit FNV-1a hash of a 2-byte key.
    fn fnv1a32(value: &[u8; 2]) -> u32 {
        const OFFSET_BASIS: u32 = 0x811c_9dc5;
        const PRIME: u32 = 0x0100_0193;

        // FNV-1a: XOR the byte into the state first, then multiply by the prime.
        value.iter().fold(OFFSET_BASIS, |state, &byte| (state ^ u32::from(byte)).wrapping_mul(PRIME))
    }

    /// A fixed-size bloom filter keyed on 2-byte values.
    ///
    /// Each key sets (or tests) three bits of an `M`-bit table stored as 64-bit words.
    pub struct BloomFilter {
        filter: Box<[u64]>,
    }

    impl Default for BloomFilter {
        /// Creates an empty filter with all `M` bits cleared.
        fn default() -> Self {
            let words = BloomFilter::M / 64;
            BloomFilter { filter: vec![0u64; words].into_boxed_slice() }
        }
    }

    impl BloomFilter {
        /// The number of bits, m, used by the bloom filter: 16384 bits (2 KiB).
        const M: usize = 2 * 1024 * 8;

        /// Derives the three bit positions associated with `key`.
        ///
        /// The 32-bit hash is split into an upper half `hi` and a lower half `lo`; the positions
        /// are `hi`, `hi + lo`, and `hi + 2 * lo` (16-bit wrapping arithmetic), each reduced
        /// modulo `M`.
        fn bit_positions(key: &[u8; 2]) -> [usize; 3] {
            let hash = fnv1a32(key);

            let hi = (hash >> 16) as u16;
            let lo = hash as u16;

            // `M` is a power of two, so masking is equivalent to a modulo reduction.
            let mask = BloomFilter::M - 1;

            [
                usize::from(hi) & mask,
                usize::from(hi.wrapping_add(lo)) & mask,
                usize::from(hi.wrapping_add(lo.wrapping_mul(2))) & mask,
            ]
        }

        /// Adds `key` to the filter.
        pub fn insert(&mut self, key: &[u8; 2]) {
            for &bit in BloomFilter::bit_positions(key).iter() {
                self.filter[bit / 64] |= 1u64 << (bit % 64);
            }
        }

        /// Returns `false` if `key` was definitely never inserted, and `true` if it may have been.
        pub fn may_contain(&self, key: &[u8; 2]) -> bool {
            BloomFilter::bit_positions(key)
                .iter()
                .all(|&bit| (self.filter[bit / 64] & (1u64 << (bit % 64))) != 0)
        }
    }
}
86
87/// `Instantiate` is an enumeration of instantiation functions used by `Descriptor` and `Probe` to
88/// instantiate metadata and container format readers.
89#[derive(Copy, Clone)]
90pub enum Instantiate {
91 /// Instantiation function for a `FormatReader`.
92 Format(fn(MediaSourceStream, &FormatOptions) -> Result<Box<dyn FormatReader>>),
93 /// Instantiation function for a `MetadataReader`.
94 Metadata(fn(&MetadataOptions) -> Box<dyn MetadataReader>),
95}
96
97/// `Descriptor` provides declarative information about container and metadata formats.
98/// `Descriptor`s are used by `Probe` and related machinery to scan a `MediaSourceStream` for media.
99#[derive(Copy, Clone)]
100pub struct Descriptor {
101 /// A short ASCII-only string identifying the codec.
102 pub short_name: &'static str,
103 /// A longer, more descriptive, string identifying the codec.
104 pub long_name: &'static str,
105 /// A list of case-insensitive file extensions that are generally used by the format.
106 pub extensions: &'static [&'static str],
107 /// A list of case-insensitive MIME types that are generally used by the format.
108 pub mime_types: &'static [&'static str],
109 /// A byte-string start-of-stream marker that will be searched for within the stream.
110 pub markers: &'static [&'static [u8]],
111 /// A function to score a context buffer.
112 pub score: fn(&[u8]) -> u8,
113 /// An instantiation function.
114 pub inst: Instantiate,
115}
116
117/// The `QueryDescriptor` trait indicates that the implementer may be registered and capable of
118/// probing.
119pub trait QueryDescriptor {
120 /// Returns a list of descriptors.
121 fn query() -> &'static [Descriptor];
122
123 /// Using the provided context buffer, score calculate and returns a value between 0 and 255
124 /// indicating the confidence of the reader in decoding or parsing the source stream.
125 fn score(context: &[u8]) -> u8;
126}
127
/// A `Hint` carries optional, caller-supplied context for probing a media source stream.
///
/// The `Probe` cannot examine the extension or MIME type of the media itself because
/// `MediaSourceStream` abstracts away such details. The embedder, however, may know them from a
/// file path, an HTTP header, email attachment metadata, etc. `Hint`s are optional, and won't lead
/// the probe astray if they're wrong, but they may provide an informed initial guess and optimize
/// the guessing process significantly, especially as more formats are registered.
#[derive(Clone, Debug, Default)]
pub struct Hint {
    extension: Option<String>,
    mime_type: Option<String>,
}

impl Hint {
    /// Instantiate an empty `Hint`, with neither an extension nor a MIME type set.
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the file extension of the `Hint`, replacing any previous one.
    ///
    /// Returns `self` so that calls can be chained.
    pub fn with_extension(&mut self, extension: &str) -> &mut Self {
        self.extension = Some(String::from(extension));
        self
    }

    /// Set the MIME/Media-type of the `Hint`, replacing any previous one.
    ///
    /// Returns `self` so that calls can be chained.
    pub fn mime_type(&mut self, mime_type: &str) -> &mut Self {
        self.mime_type = Some(String::from(mime_type));
        self
    }
}
159
160/// Metadata that came from the `metadata` field of [`ProbeResult`].
161pub struct ProbedMetadata {
162 metadata: Option<MetadataLog>,
163}
164
165impl ProbedMetadata {
166 /// Returns the metadata that was found during probing.
167 ///
168 /// If any additional metadata was present outside of the container, this is
169 /// `Some` and the log will have at least one item in it.
170 pub fn get(&mut self) -> Option<Metadata<'_>> {
171 self.metadata.as_mut().map(|m| m.metadata())
172 }
173
174 /// Returns the inner metadata log, if it was present.
175 pub fn into_inner(self) -> Option<MetadataLog> {
176 self.metadata
177 }
178}
179
180/// `ProbeResult` contains the result of a format probe operation.
181pub struct ProbeResult {
182 /// An instance of a `FormatReader` for the probed format
183 pub format: Box<dyn FormatReader>,
184 /// A log of `Metadata` revisions read during the probe operation before the instantiation of
185 /// the `FormatReader`.
186 ///
187 /// Metadata that was part of the container format itself can be read by calling `.metadata()`
188 /// on `format`.
189 pub metadata: ProbedMetadata,
190}
191
192/// `Probe` scans a `MediaSourceStream` for metadata and container formats, and provides an
193/// iterator-like interface to instantiate readers for the formats encountered.
194#[derive(Default)]
195pub struct Probe {
196 filter: bloom::BloomFilter,
197 registered: Vec<Descriptor>,
198}
199
200impl Probe {
201 const PROBE_SEARCH_LIMIT: u64 = 1 * 1024 * 1024;
202
203 /// Register all `Descriptor`s supported by the parameterized type.
204 pub fn register_all<Q: QueryDescriptor>(&mut self) {
205 for descriptor in Q::query() {
206 self.register(descriptor);
207 }
208 }
209
210 /// Register a single `Descriptor`.
211 pub fn register(&mut self, descriptor: &Descriptor) {
212 // Insert 2-byte prefixes for each marker into the bloom filter.
213 for marker in descriptor.markers {
214 let mut prefix = [0u8; 2];
215
216 match marker.len() {
217 2..=16 => prefix.copy_from_slice(&marker[0..2]),
218 _ => panic!("invalid marker length (only 2-16 bytes supported)."),
219 }
220
221 self.filter.insert(&prefix);
222 }
223
224 self.registered.push(*descriptor);
225 }
226
227 /// Searches the provided `MediaSourceStream` for metadata or a container format.
228 pub fn next(&self, mss: &mut MediaSourceStream) -> Result<Instantiate> {
229 let mut win = 0u16;
230
231 let init_pos = mss.pos();
232 let mut count = 0;
233
234 // Scan the stream byte-by-byte. Shifting each byte through a 2-byte window.
235 while let Ok(byte) = mss.read_byte() {
236 win = (win << 8) | u16::from(byte);
237
238 count += 1;
239
240 if count > Probe::PROBE_SEARCH_LIMIT {
241 break;
242 }
243
244 if count % 4096 == 0 {
245 debug!(
246 "searching for format marker... {}+{} / {} bytes.",
247 init_pos,
248 count,
249 Probe::PROBE_SEARCH_LIMIT
250 );
251 }
252
253 // Use the bloom filter to check if the the window may be a prefix of a registered
254 // marker.
255 if self.filter.may_contain(&win.to_be_bytes()) {
256 // Using the 2-byte window, and a further 14 bytes, create a larger 16-byte window.
257 let mut context = [0u8; 16];
258
259 context[0..2].copy_from_slice(&win.to_be_bytes()[0..2]);
260 mss.read_buf_exact(&mut context[2..])?;
261
262 debug!(
263 "found a possible format marker within {:x?} @ {}+{} bytes.",
264 context, init_pos, count,
265 );
266
267 // Search for registered markers in the 16-byte window.
268 for registered in &self.registered {
269 for marker in registered.markers {
270 let len = marker.len();
271
272 // If a match is found, return the instantiate.
273 if context[0..len] == **marker {
274 // Re-align the stream to the start of the marker.
275 mss.seek_buffered_rev(16);
276
277 // TODO: Implement scoring.
278
279 info!(
280 "found the format marker {:x?} @ {}+{} bytes.",
281 &context[0..len],
282 init_pos,
283 count,
284 );
285
286 return Ok(registered.inst);
287 }
288 }
289 }
290
291 // If no registered markers were matched, then the bloom filter returned a false
292 // positive. Re-align the stream to the end of the 2-byte window and continue the
293 // search.
294 mss.seek_buffered_rev(16 - 2);
295 }
296 }
297
298 if count < Probe::PROBE_SEARCH_LIMIT {
299 error!("probe reach EOF at {} bytes.", count);
300 }
301 else {
302 // Could not find any marker within the probe limit.
303 error!("reached probe limit of {} bytes.", Probe::PROBE_SEARCH_LIMIT);
304 }
305
306 unsupported_error("core (probe): no suitable format reader found")
307 }
308
309 /// Searches the provided `MediaSourceStream` for a container format. Any metadata that is read
310 /// during the search will be queued and attached to the `FormatReader` instance once a
311 /// container format is found.
312 pub fn format(
313 &self,
314 _hint: &Hint,
315 mut mss: MediaSourceStream,
316 format_opts: &FormatOptions,
317 metadata_opts: &MetadataOptions,
318 ) -> Result<ProbeResult> {
319 let mut metadata: MetadataLog = Default::default();
320
321 // Loop over all elements in the stream until a container format is found.
322 loop {
323 match self.next(&mut mss)? {
324 // If a container format is found, return an instance to it's reader.
325 Instantiate::Format(fmt) => {
326 let format = fmt(mss, format_opts)?;
327
328 let metadata =
329 if metadata.metadata().current().is_some() { Some(metadata) } else { None };
330
331 return Ok(ProbeResult { format, metadata: ProbedMetadata { metadata } });
332 }
333 // If metadata was found, instantiate the metadata reader, read the metadata, and
334 // push it onto the metadata log.
335 Instantiate::Metadata(meta) => {
336 let mut reader = meta(metadata_opts);
337 metadata.push(reader.read_all(&mut mss)?);
338
339 debug!("chaining a metadata element.");
340 }
341 }
342 }
343
344 // This function returns when either the end-of-stream is reached, an error occurs, or a
345 // container format is found.
346 }
347}
348
/// Convenience macro for declaring a probe `Descriptor` for a `FormatReader`.
///
/// The arguments are, in order: the short name, the long name, the file extensions, the MIME
/// types, and the stream markers of the format.
///
/// The expansion refers to `Self::score` and `Self::try_new`, and to `Descriptor` and
/// `Instantiate` by their unqualified names. It must therefore be invoked where `Self` names the
/// reader type and both of those items are in scope.
#[macro_export]
macro_rules! support_format {
    ($short_name:expr, $long_name:expr, $exts:expr, $mimes:expr, $markers:expr) => {
        Descriptor {
            short_name: $short_name,
            long_name: $long_name,
            extensions: $exts,
            mime_types: $mimes,
            markers: $markers,
            score: Self::score,
            inst: Instantiate::Format(|mss, opts| {
                let reader = Self::try_new(mss, opts)?;
                Ok(Box::new(reader))
            }),
        }
    };
}
364
/// Convenience macro for declaring a probe `Descriptor` for a `MetadataReader`.
///
/// The arguments are, in order: the short name, the long name, the file extensions, the MIME
/// types, and the stream markers of the metadata format.
///
/// The expansion refers to `Self::score` and `Self::new`, and to `Descriptor` and `Instantiate`
/// by their unqualified names. It must therefore be invoked where `Self` names the reader type
/// and both of those items are in scope.
#[macro_export]
macro_rules! support_metadata {
    ($short_name:expr, $long_name:expr, $exts:expr, $mimes:expr, $markers:expr) => {
        Descriptor {
            short_name: $short_name,
            long_name: $long_name,
            extensions: $exts,
            mime_types: $mimes,
            markers: $markers,
            score: Self::score,
            inst: Instantiate::Metadata(|opts| {
                let reader = Self::new(opts);
                Box::new(reader)
            }),
        }
    };
}