From cf1cb04af05e75fbb99292b9c9aed450704ccc82 Mon Sep 17 00:00:00 2001 From: Zaggy1024 Date: Sat, 24 Jun 2023 03:21:32 -0500 Subject: [PATCH] LibVideo/Matroska: Don't choke on files containing CRC32 elements The EBML specification allows for CRC32 elements to be placed as the first child element of a master element. However, our parsing of master elements didn't take that into account, so an error would be thrown. Instead of erroring out, the `parse_master_element()` function will now skip CRC32 elements that are found as the first child of a master element. If it is found after the first child, that will be considered an error. Void elements will also be skipped by `parse_master_element()`. Since the `parse_cluster()` function has to seek the stream back to the cluster's first child in order to allow cues' positions to be used correctly, `parse_master_element()` had to be changed to return the first element position, since the callback is not invoked for CRC32 elements. This means that the parameter used to communicate the element position to the child element parsing function is unused, so that is removed. --- .../LibVideo/Containers/Matroska/Reader.cpp | 91 +++++++++++++------ 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/Userland/Libraries/LibVideo/Containers/Matroska/Reader.cpp b/Userland/Libraries/LibVideo/Containers/Matroska/Reader.cpp index 43a1ee64a8..9d6ca54a18 100644 --- a/Userland/Libraries/LibVideo/Containers/Matroska/Reader.cpp +++ b/Userland/Libraries/LibVideo/Containers/Matroska/Reader.cpp @@ -1,6 +1,6 @@ /* * Copyright (c) 2021, Hunter Salyer - * Copyright (c) 2022, Gregory Bertilson + * Copyright (c) 2022-2023, Gregory Bertilson * * SPDX-License-Identifier: BSD-2-Clause */ @@ -18,9 +18,14 @@ namespace Video::Matroska { #define TRY_READ(expression) DECODER_TRY(DecoderErrorCategory::Corrupted, expression) -// Elements IDs and types are listed at this URL: -// https://www.matroska.org/technical/elements.html +// RFC 8794 - Extensible Binary Meta Language +// https://datatracker.ietf.org/doc/html/rfc8794 constexpr u32 EBML_MASTER_ELEMENT_ID = 0x1A45DFA3; +constexpr u32 EBML_CRC32_ELEMENT_ID = 0xBF; +constexpr u32 EBML_VOID_ELEMENT_ID = 0xEC; + +// Matroska elements' IDs and types are listed at this URL: +// https://www.matroska.org/technical/elements.html constexpr u32 SEGMENT_ELEMENT_ID = 0x18538067; constexpr u32 DOCTYPE_ELEMENT_ID = 0x4282; constexpr u32 DOCTYPE_VERSION_ELEMENT_ID = 0x4287; @@ -98,35 +103,69 @@ DecoderErrorOr Reader::from_data(ReadonlyBytes data) return reader; } -static DecoderErrorOr parse_master_element(Streamer& streamer, [[maybe_unused]] StringView element_name, Function(u64, size_t)> element_consumer) +// Returns the position of the first element that is read from this master element. +static DecoderErrorOr parse_master_element(Streamer& streamer, [[maybe_unused]] StringView element_name, Function(u64)> element_consumer) { auto element_data_size = TRY_READ(streamer.read_variable_size_integer()); dbgln_if(MATROSKA_DEBUG, "{} has {} octets of data.", element_name, element_data_size); + bool first_element = true; + auto first_element_position = streamer.position(); + streamer.push_octets_read(); while (streamer.octets_read() < element_data_size) { dbgln_if(MATROSKA_TRACE_DEBUG, "====== Reading element ======"); auto element_id = TRY_READ(streamer.read_variable_size_integer(false)); - auto element_position = streamer.position(); dbgln_if(MATROSKA_TRACE_DEBUG, "{:s} element ID is {:#010x}", element_name, element_id); - auto result = element_consumer(element_id, element_position); + if (element_id == EBML_CRC32_ELEMENT_ID) { + // The CRC-32 Element contains a 32-bit Cyclic Redundancy Check value of all the + // Element Data of the Parent Element as stored except for the CRC-32 Element itself. + // When the CRC-32 Element is present, the CRC-32 Element MUST be the first ordered + // EBML Element within its Parent Element for easier reading. + if (!first_element) + return DecoderError::corrupted("CRC32 element must be the first child"sv); + + // All Top-Level Elements of an EBML Document that are Master Elements SHOULD include a + // CRC-32 Element as a Child Element. The CRC in use is the IEEE-CRC-32 algorithm as used + // in the [ISO3309] standard and in Section 8.1.1.6.2 of [ITU.V42], with initial value of + // 0xFFFFFFFF. The CRC value MUST be computed on a little-endian bytestream and MUST use + // little-endian storage. + + // FIXME: Currently we skip the CRC-32 Element instead of checking it. It may be worth + // verifying the contents of the SeekHead, Segment Info, and Tracks Elements. + // Note that Cluster Elements tend to be quite large, so verifying their integrity + // will result in longer buffering times in streamed contexts, so it may not be + // worth the effort checking those. It would also prevent error correction in + // video codecs from taking effect. + TRY_READ(streamer.read_unknown_element()); + continue; + } + if (element_id == EBML_VOID_ELEMENT_ID) { + // Used to void data or to avoid unexpected behaviors when using damaged data. + // The content is discarded. Also used to reserve space in a subelement for later use. + TRY_READ(streamer.read_unknown_element()); + continue; + } + + auto result = element_consumer(element_id); if (result.is_error()) return DecoderError::format(result.error().category(), "{} -> {}", element_name, result.error().description()); if (result.release_value() == IterationDecision::Break) break; dbgln_if(MATROSKA_TRACE_DEBUG, "Read {} octets of the {} so far.", streamer.octets_read(), element_name); + first_element = false; } streamer.pop_octets_read(); - return {}; + return first_element_position; } static DecoderErrorOr parse_ebml_header(Streamer& streamer) { EBMLHeader header; - TRY(parse_master_element(streamer, "Header"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "Header"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case DOCTYPE_ELEMENT_ID: header.doc_type = TRY_READ(streamer.read_string()); @@ -170,11 +209,11 @@ DecoderErrorOr Reader::parse_initial_data() static DecoderErrorOr parse_seek_head(Streamer& streamer, size_t base_position, HashMap& table) { - return parse_master_element(streamer, "SeekHead"sv, [&](u64 seek_head_child_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "SeekHead"sv, [&](u64 seek_head_child_id) -> DecoderErrorOr { if (seek_head_child_id == SEEK_ELEMENT_ID) { Optional seek_id; Optional seek_position; - TRY(parse_master_element(streamer, "Seek"sv, [&](u64 seek_entry_child_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "Seek"sv, [&](u64 seek_entry_child_id) -> DecoderErrorOr { switch (seek_entry_child_id) { case SEEK_ID_ELEMENT_ID: seek_id = TRY_READ(streamer.read_u64()); @@ -212,7 +251,8 @@ static DecoderErrorOr parse_seek_head(Streamer& streamer, size_t base_posi } return IterationDecision::Continue; - }); + })); + return {}; } DecoderErrorOr> Reader::find_first_top_level_element_with_id([[maybe_unused]] StringView element_name, u32 element_id) @@ -272,7 +312,7 @@ DecoderErrorOr> Reader::find_first_top_level_element_with_id([[ static DecoderErrorOr parse_information(Streamer& streamer) { SegmentInformation segment_information; - TRY(parse_master_element(streamer, "Segment Information"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "Segment Information"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case TIMESTAMP_SCALE_ID: segment_information.set_timestamp_scale(TRY_READ(streamer.read_u64())); @@ -331,7 +371,7 @@ static DecoderErrorOr parse_video_color_information(Str { TrackEntry::ColorFormat color_format {}; - TRY(parse_master_element(streamer, "Colour"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "Colour"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case PRIMARIES_ID: color_format.color_primaries = static_cast(TRY_READ(streamer.read_u64())); @@ -363,7 +403,7 @@ static DecoderErrorOr parse_video_track_information(Stre { TrackEntry::VideoTrack video_track {}; - TRY(parse_master_element(streamer, "VideoTrack"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "VideoTrack"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case PIXEL_WIDTH_ID: video_track.pixel_width = TRY_READ(streamer.read_u64()); @@ -390,7 +430,7 @@ static DecoderErrorOr parse_audio_track_information(Stre { TrackEntry::AudioTrack audio_track {}; - TRY(parse_master_element(streamer, "AudioTrack"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "AudioTrack"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case CHANNELS_ID: audio_track.channels = TRY_READ(streamer.read_u64()); @@ -413,7 +453,7 @@ static DecoderErrorOr parse_audio_track_information(Stre static DecoderErrorOr parse_track_entry(Streamer& streamer) { TrackEntry track_entry; - TRY(parse_master_element(streamer, "Track"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "Track"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case TRACK_NUMBER_ID: track_entry.set_track_number(TRY_READ(streamer.read_u64())); @@ -461,7 +501,7 @@ static DecoderErrorOr parse_track_entry(Streamer& streamer) DecoderErrorOr Reader::parse_tracks(Streamer& streamer) { - return parse_master_element(streamer, "Tracks"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "Tracks"sv, [&](u64 element_id) -> DecoderErrorOr { if (element_id == TRACK_ENTRY_ID) { auto track_entry = TRY(parse_track_entry(streamer)); dbgln_if(MATROSKA_DEBUG, "Parsed track {}", track_entry.track_number()); @@ -471,7 +511,8 @@ DecoderErrorOr Reader::parse_tracks(Streamer& streamer) } return IterationDecision::Continue; - }); + })); + return {}; } DecoderErrorOr Reader::for_each_track(TrackEntryCallback callback) @@ -517,12 +558,8 @@ constexpr size_t get_element_id_size(u32 element_id) static DecoderErrorOr parse_cluster(Streamer& streamer, u64 timestamp_scale) { Optional timestamp; - size_t first_element_position = 0; - - TRY(parse_master_element(streamer, "Cluster"sv, [&](u64 element_id, size_t position) -> DecoderErrorOr { - if (first_element_position == 0) - first_element_position = position - get_element_id_size(element_id); + auto first_element_position = TRY(parse_master_element(streamer, "Cluster"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case TIMESTAMP_ID: timestamp = TRY_READ(streamer.read_u64()); @@ -648,7 +685,7 @@ static DecoderErrorOr parse_cue_track_position(Streamer& strea bool had_cluster_position = false; - TRY_READ(parse_master_element(streamer, "CueTrackPositions"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY_READ(parse_master_element(streamer, "CueTrackPositions"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case CUE_TRACK_ID: track_position.set_track_number(TRY_READ(streamer.read_u64())); @@ -692,7 +729,7 @@ static DecoderErrorOr parse_cue_point(Streamer& streamer, u64 timestam { CuePoint cue_point; - TRY(parse_master_element(streamer, "CuePoint"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "CuePoint"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case CUE_TIME_ID: { // On https://www.matroska.org/technical/elements.html, spec says of the CueTime element: @@ -735,7 +772,7 @@ DecoderErrorOr Reader::parse_cues(Streamer& streamer) { m_cues.clear(); - TRY(parse_master_element(streamer, "Cues"sv, [&](u64 element_id, size_t) -> DecoderErrorOr { + TRY(parse_master_element(streamer, "Cues"sv, [&](u64 element_id) -> DecoderErrorOr { switch (element_id) { case CUE_POINT_ID: { auto cue_point = TRY(parse_cue_point(streamer, TRY(segment_information()).timestamp_scale())); @@ -972,7 +1009,7 @@ ErrorOr Streamer::read_i16() ErrorOr Streamer::read_variable_size_integer(bool mask_length) { - dbgln_if(MATROSKA_TRACE_DEBUG, "Reading from offset {:p}", data()); + dbgln_if(MATROSKA_TRACE_DEBUG, "Reading VINT from offset {:p}", position()); auto length_descriptor = TRY(read_octet()); dbgln_if(MATROSKA_TRACE_DEBUG, "Reading VINT, first byte is {:#02x}", length_descriptor); if (length_descriptor == 0)